In [1]:
import sys
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from cuml.linear_model import LogisticRegression as cuMLLogisticRegression

print('NOVA_HOME is at', os.getenv('NOVA_HOME'))
sys.path.insert(1, os.getenv('NOVA_HOME'))
%load_ext autoreload
%autoreload 2

from utils import *
NOVA_HOME is at /home/projects/hornsteinlab/Collaboration/NOVA_GAL/NOVA
NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA_GAL/NOVA
In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from cuml.ensemble import RandomForestClassifier as cuRF
from sklearn.ensemble import ExtraTreesClassifier
In [19]:
dataset_config = {
    "path_to_embeddings": "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen",
    "multiplexed": False,
    "config_fmt": "NIH_UMAP1_DatasetConfig_B{batch}",
    "config_dir": "manuscript/manuscript_figures_data_config",
}
In [25]:
## Baseline
run_baseline_model(
    dataset_config= dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:19:40 INFO: [load_embeddings] multiplex=False
2025-08-20 16:19:40 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:19:40 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:19:40 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:19:42 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:19:43 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:19:44 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:19:44 INFO: [load_embeddings] embeddings shape: (115587, 192)
2025-08-20 16:19:44 INFO: [load_embeddings] labels shape: (115587,)
2025-08-20 16:19:44 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 16:19:44 INFO: [load_embeddings] paths shape: (115587,)
2025-08-20 16:19:44 INFO: [load_embeddings] multiplex=False
2025-08-20 16:19:44 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:19:44 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:19:44 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:19:46 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:19:47 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:19:47 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:19:48 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 16:19:48 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 16:19:48 INFO: [load_embeddings] example label: DCP1A_WT_Untreated
2025-08-20 16:19:48 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 16:19:48 INFO: [load_embeddings] multiplex=False
2025-08-20 16:19:48 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:19:48 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:19:48 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:19:50 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:19:50 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:19:51 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:19:51 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 16:19:51 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 16:19:51 INFO: [load_embeddings] example label: TUJ1_WT_Untreated
2025-08-20 16:19:51 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      2123
           1       0.99      0.95      0.97      2536
           2       0.94      0.93      0.94      2079
           3       1.00      1.00      1.00     24823
           4       0.89      0.82      0.86      2319
           5       0.86      0.89      0.88      2608
           6       0.98      0.92      0.95      2236
           7       0.98      0.98      0.98      2265
           8       0.98      0.97      0.98      2110
           9       0.93      0.86      0.90      2104
          10       0.93      0.93      0.93      2243
          11       0.95      0.98      0.97      2236
          12       0.98      0.98      0.98      2227
          13       0.95      0.93      0.94      2360
          14       0.96      0.94      0.95      1916
          15       0.95      0.95      0.95      2074
          16       0.92      0.86      0.89      1818
          17       0.86      0.88      0.87      1631
          18       0.94      0.97      0.95      2090
          19       0.98      0.88      0.93      2019
          20       0.88      0.96      0.91      1923
          21       0.78      0.84      0.81      1654
          22       0.89      0.93      0.91      1934
          23       0.93      0.94      0.94      2086
          24       0.96      0.97      0.96      2114
          25       0.99      0.99      0.99     18531

    accuracy                           0.96     94059
   macro avg       0.93      0.93      0.93     94059
weighted avg       0.96      0.96      0.96     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
              precision    recall  f1-score   support

           0       0.86      0.89      0.87      1850
           1       0.97      0.59      0.73      2044
           2       0.95      0.95      0.95      2332
           3       0.99      1.00      1.00     22599
           4       0.72      0.74      0.73      1901
           5       0.65      0.56      0.60      1492
           6       0.98      0.85      0.91      2095
           7       0.96      0.95      0.95      2384
           8       0.98      0.97      0.97      2145
           9       0.96      0.82      0.89      2358
          10       0.96      0.95      0.96      2340
          11       0.97      0.96      0.96      2095
          12       0.93      0.99      0.96      2085
          13       0.85      0.99      0.92      2117
          14       0.94      0.92      0.93      1751
          15       0.97      0.88      0.92      1855
          16       0.86      0.52      0.65      1623
          17       0.55      0.71      0.62      1903
          18       0.83      0.99      0.90      2085
          19       0.98      0.90      0.94      2152
          20       0.69      0.98      0.81      1857
          21       0.61      0.28      0.38      1484
          22       0.92      0.92      0.92      1836
          23       0.82      0.93      0.87      2078
          24       0.97      0.97      0.97      2200
          25       0.94      0.98      0.96     16469

    accuracy                           0.92     87130
   macro avg       0.88      0.85      0.86     87130
weighted avg       0.92      0.92      0.91     87130


=== Overall Accuracy ===
0.9389480095328129 [0.9595785623916904, 0.9183174566739355]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.995381     0.914926     0.997184 0.879294 0.998091
        CLTC_WT_Untreated  0.994310     0.791048     0.999581 0.979984 0.994608
Calreticulin_WT_Untreated  0.997268     0.943550     0.998608 0.944192 0.998591
        DAPI_WT_Untreated  0.998565     1.000000     0.998056 0.994547 1.000000
       DCP1A_WT_Untreated  0.990717     0.785782     0.995604 0.809966 0.994895
        FMRP_WT_Untreated  0.990320     0.771463     0.995387 0.794724 0.994712
         FUS_WT_Untreated  0.996865     0.887786     0.999536 0.979119 0.997258
       G3BP1_WT_Untreated  0.998195     0.962357     0.999139 0.967142 0.999009
       GM130_WT_Untreated  0.998830     0.968743     0.999554 0.981195 0.999249
       KIF5A_WT_Untreated  0.994928     0.839534     0.998851 0.948595 0.995960
       LAMP1_WT_Untreated  0.997196     0.944141     0.998573 0.944966 0.998550
 MitoTracker_WT_Untreated  0.998278     0.971369     0.998937 0.957224 0.999299
         NCL_WT_Untreated  0.998537     0.985622     0.998852 0.954413 0.999649
        NEMO_WT_Untreated  0.996429     0.960688     0.997335 0.901299 0.999002
         P54_WT_Untreated  0.997610     0.929915     0.999009 0.950920 0.998553
       PEX14_WT_Untreated  0.997362     0.917282     0.999137 0.959276 0.998168
         PML_WT_Untreated  0.992820     0.700378     0.998481 0.899254 0.994224
       PSD95_WT_Untreated  0.988537     0.792869     0.992429 0.675669 0.995865
        PURA_WT_Untreated  0.996440     0.981078     0.996803 0.878593 0.999552
  Phalloidin_WT_Untreated  0.997053     0.887317     0.999638 0.983001 0.997351
        SNCA_WT_Untreated  0.993416     0.968783     0.993941 0.773063 0.999331
      SQSTM1_WT_Untreated  0.989000     0.575207     0.996293 0.732252 0.992542
       TDP43_WT_Untreated  0.996523     0.927056     0.997999 0.907792 0.998449
        TIA1_WT_Untreated  0.995292     0.935399     0.996701 0.869614 0.998478
      TOMM20_WT_Untreated  0.998433     0.967548     0.999186 0.966651 0.999208
        TUJ1_WT_Untreated  0.991169     0.989229     0.991634 0.965882 0.997406
            Macro Average  0.995364     0.896118     0.997556 0.907639 0.997616
Out[25]:
{'Accuracy': 0.99536438151917,
 'Sensitivity': 0.8961180450904996,
 'Specificity': 0.9975556871415043,
 'PPV': 0.9076394777540618,
 'NPV': 0.99761554225463}
In [5]:
## Baseline
run_baseline_model(
    dataset_config= dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
2025-08-19 14:34:37 INFO: [load_embeddings] multiplex=False
2025-08-19 14:34:37 INFO: [load_embeddings] experiment_type = NIH
2025-08-19 14:34:37 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-19 14:34:37 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-19 14:34:39 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-19 14:34:40 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-19 14:34:41 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-19 14:34:41 INFO: [load_embeddings] embeddings shape: (115587, 192)
2025-08-19 14:34:41 INFO: [load_embeddings] labels shape: (115587,)
2025-08-19 14:34:41 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-19 14:34:41 INFO: [load_embeddings] paths shape: (115587,)
2025-08-19 14:34:41 INFO: [load_embeddings] multiplex=False
2025-08-19 14:34:41 INFO: [load_embeddings] experiment_type = NIH
2025-08-19 14:34:41 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-19 14:34:41 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-19 14:34:43 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-19 14:34:44 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-19 14:34:45 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-19 14:34:45 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-19 14:34:45 INFO: [load_embeddings] labels shape: (94059,)
2025-08-19 14:34:45 INFO: [load_embeddings] example label: DCP1A_WT_Untreated
2025-08-19 14:34:45 INFO: [load_embeddings] paths shape: (94059,)
2025-08-19 14:34:45 INFO: [load_embeddings] multiplex=False
2025-08-19 14:34:45 INFO: [load_embeddings] experiment_type = NIH
2025-08-19 14:34:45 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-19 14:34:45 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-19 14:34:47 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-19 14:34:48 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-19 14:34:49 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-19 14:34:49 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-19 14:34:49 INFO: [load_embeddings] labels shape: (87130,)
2025-08-19 14:34:49 INFO: [load_embeddings] example label: TUJ1_WT_Untreated
2025-08-19 14:34:49 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [2, 3], Testing on: [1].

=== Batch [1] ===
Train: (181189, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DCP1A_WT_Untreated: 4220
SQSTM1_WT_Untreated: 3138
ANAX11_WT_Untreated: 3973
TUJ1_WT_Untreated: 35000
DAPI_WT_Untreated: 47422
Calreticulin_WT_Untreated: 4411
NEMO_WT_Untreated: 4477
SNCA_WT_Untreated: 3780
TOMM20_WT_Untreated: 4314
KIF5A_WT_Untreated: 4462
Phalloidin_WT_Untreated: 4171
PEX14_WT_Untreated: 3929
GM130_WT_Untreated: 4255
MitoTracker_WT_Untreated: 4331
NCL_WT_Untreated: 4312
CLTC_WT_Untreated: 4580
FMRP_WT_Untreated: 4100
PSD95_WT_Untreated: 3534
PML_WT_Untreated: 3441
G3BP1_WT_Untreated: 4649
TDP43_WT_Untreated: 3770
P54_WT_Untreated: 3667
PURA_WT_Untreated: 4175
TIA1_WT_Untreated: 4164
FUS_WT_Untreated: 4331
LAMP1_WT_Untreated: 4583
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      2614
           1       0.95      0.95      0.95      2439
           2       0.94      0.95      0.95      3056
           3       1.00      1.00      1.00     30428
           4       0.89      0.90      0.90      2364
           5       0.89      0.89      0.89      2913
           6       0.96      0.98      0.97      2728
           7       0.98      0.98      0.98      2842
           8       0.97      0.98      0.97      2371
           9       0.79      0.90      0.84      2622
          10       0.96      0.95      0.95      3067
          11       0.99      0.98      0.98      2728
          12       1.00      0.98      0.99      2709
          13       0.96      0.97      0.97      2935
          14       0.97      0.97      0.97      2622
          15       0.94      0.97      0.96      2505
          16       0.83      0.90      0.87      2297
          17       0.89      0.82      0.85      2101
          18       0.99      0.96      0.97      2712
          19       0.88      0.98      0.93      2219
          20       0.96      0.84      0.90      2454
          21       0.83      0.72      0.77      2651
          22       0.95      0.94      0.95      2534
          23       0.94      0.91      0.93      2712
          24       0.97      0.99      0.98      2363
          25       0.99      0.99      0.99     22601

    accuracy                           0.96    115587
   macro avg       0.94      0.94      0.94    115587
weighted avg       0.96      0.96      0.96    115587

Training on Batches: [1, 3], Testing on: [2].

=== Batch [2] ===
Train: (202717, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 53027
TUJ1_WT_Untreated: 39070
LAMP1_WT_Untreated: 5407
DCP1A_WT_Untreated: 4265
TDP43_WT_Untreated: 4370
P54_WT_Untreated: 4373
SNCA_WT_Untreated: 4311
CLTC_WT_Untreated: 4483
PEX14_WT_Untreated: 4360
PURA_WT_Untreated: 4797
G3BP1_WT_Untreated: 5226
Phalloidin_WT_Untreated: 4371
NEMO_WT_Untreated: 5052
SQSTM1_WT_Untreated: 4135
PML_WT_Untreated: 3920
GM130_WT_Untreated: 4516
Calreticulin_WT_Untreated: 5388
KIF5A_WT_Untreated: 4980
FMRP_WT_Untreated: 4405
NCL_WT_Untreated: 4794
TOMM20_WT_Untreated: 4563
FUS_WT_Untreated: 4823
MitoTracker_WT_Untreated: 4823
TIA1_WT_Untreated: 4790
PSD95_WT_Untreated: 4004
ANAX11_WT_Untreated: 4464
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      2123
           1       0.97      0.97      0.97      2536
           2       0.95      0.95      0.95      2079
           3       1.00      1.00      1.00     24823
           4       0.88      0.87      0.87      2319
           5       0.89      0.86      0.87      2608
           6       0.97      0.92      0.94      2236
           7       0.97      0.98      0.97      2265
           8       0.97      0.98      0.98      2110
           9       0.89      0.90      0.89      2104
          10       0.94      0.93      0.93      2243
          11       0.97      0.99      0.98      2236
          12       0.98      0.98      0.98      2227
          13       0.96      0.89      0.93      2360
          14       0.96      0.95      0.95      1916
          15       0.94      0.96      0.95      2074
          16       0.88      0.87      0.87      1818
          17       0.86      0.82      0.84      1631
          18       0.97      0.97      0.97      2090
          19       0.98      0.92      0.95      2019
          20       0.91      0.93      0.92      1923
          21       0.73      0.83      0.77      1654
          22       0.89      0.94      0.91      1934
          23       0.94      0.94      0.94      2086
          24       0.96      0.98      0.97      2114
          25       0.99      0.99      0.99     18531

    accuracy                           0.96     94059
   macro avg       0.93      0.93      0.93     94059
weighted avg       0.96      0.96      0.96     94059

Training on Batches: [1, 2], Testing on: [3].

=== Batch [3] ===
Train: (209646, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 55251
TUJ1_WT_Untreated: 41132
LAMP1_WT_Untreated: 5310
DCP1A_WT_Untreated: 4683
TDP43_WT_Untreated: 4468
P54_WT_Untreated: 4538
SNCA_WT_Untreated: 4377
CLTC_WT_Untreated: 4975
PEX14_WT_Untreated: 4579
PURA_WT_Untreated: 4802
G3BP1_WT_Untreated: 5107
Phalloidin_WT_Untreated: 4238
NEMO_WT_Untreated: 5295
SQSTM1_WT_Untreated: 4305
PML_WT_Untreated: 4115
GM130_WT_Untreated: 4481
Calreticulin_WT_Untreated: 5135
KIF5A_WT_Untreated: 4726
FMRP_WT_Untreated: 5521
NCL_WT_Untreated: 4936
TOMM20_WT_Untreated: 4477
FUS_WT_Untreated: 4964
MitoTracker_WT_Untreated: 4964
TIA1_WT_Untreated: 4798
PSD95_WT_Untreated: 3732
ANAX11_WT_Untreated: 4737
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1850
           1       0.97      0.62      0.75      2044
           2       0.96      0.97      0.96      2332
           3       0.99      1.00      1.00     22599
           4       0.74      0.80      0.77      1901
           5       0.69      0.57      0.62      1492
           6       0.97      0.89      0.93      2095
           7       0.96      0.95      0.95      2384
           8       0.98      0.98      0.98      2145
           9       0.97      0.90      0.93      2358
          10       0.96      0.96      0.96      2340
          11       0.98      0.96      0.97      2095
          12       0.95      0.99      0.97      2085
          13       0.82      0.99      0.90      2117
          14       0.94      0.94      0.94      1751
          15       0.97      0.88      0.92      1855
          16       0.84      0.55      0.66      1623
          17       0.55      0.72      0.62      1903
          18       0.88      0.99      0.93      2085
          19       0.98      0.93      0.95      2152
          20       0.74      0.97      0.84      1857
          21       0.69      0.25      0.37      1484
          22       0.93      0.94      0.93      1836
          23       0.83      0.94      0.89      2078
          24       0.97      0.97      0.97      2200
          25       0.95      0.99      0.97     16469

    accuracy                           0.93     87130
   macro avg       0.89      0.87      0.87     87130
weighted avg       0.93      0.93      0.92     87130


=== Overall Accuracy ===
0.9499144449661753 [0.9621670257035826, 0.9605247770016692, 0.9270515321932744]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.996651     0.934113     0.998070 0.916580 0.998504
        CLTC_WT_Untreated  0.995879     0.860521     0.999158 0.961171 0.996630
Calreticulin_WT_Untreated  0.997557     0.953127     0.998704 0.949947 0.998790
        DAPI_WT_Untreated  0.999431     0.999961     0.999242 0.997872 0.999986
       DCP1A_WT_Untreated  0.993278     0.861634     0.996265 0.839574 0.996859
        FMRP_WT_Untreated  0.992240     0.809497     0.996663 0.854455 0.995395
         FUS_WT_Untreated  0.997685     0.933418     0.999251 0.968116 0.998379
       G3BP1_WT_Untreated  0.998528     0.971699     0.999222 0.970016 0.999267
       GM130_WT_Untreated  0.998912     0.978117     0.999387 0.973269 0.999500
       KIF5A_WT_Untreated  0.994413     0.898927     0.996748 0.871135 0.997526
       LAMP1_WT_Untreated  0.997409     0.945882     0.998772 0.953234 0.998568
 MitoTracker_WT_Untreated  0.998932     0.975776     0.999496 0.979244 0.999410
         NCL_WT_Untreated  0.999073     0.982766     0.999469 0.978168 0.999582
        NEMO_WT_Untreated  0.996624     0.953724     0.997723 0.914726 0.998813
         P54_WT_Untreated  0.998137     0.954206     0.999088 0.957708 0.999009
       PEX14_WT_Untreated  0.997598     0.942648     0.998815 0.946325 0.998729
         PML_WT_Untreated  0.993274     0.790694     0.997268 0.850900 0.995879
       PSD95_WT_Untreated  0.990666     0.784383     0.994659 0.739749 0.995822
        PURA_WT_Untreated  0.998056     0.969508     0.998734 0.947899 0.999275
  Phalloidin_WT_Untreated  0.997564     0.944757     0.998726 0.942251 0.998784
        SNCA_WT_Untreated  0.995027     0.905839     0.996940 0.863984 0.997978
      SQSTM1_WT_Untreated  0.989160     0.632234     0.996261 0.770851 0.992710
       TDP43_WT_Untreated  0.997119     0.940990     0.998337 0.924708 0.998719
        TIA1_WT_Untreated  0.996155     0.931646     0.997685 0.905186 0.998378
      TOMM20_WT_Untreated  0.998864     0.980081     0.999297 0.969769 0.999541
        TUJ1_WT_Untreated  0.994444     0.991146     0.995238 0.980440 0.997862
            Macro Average  0.996257     0.916434     0.998047 0.920280 0.998073
In [11]:
run_train_test_split_baseline(
    dataset_config,        
    batches=[1],  
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},     
)
2025-08-20 12:30:33 INFO: [load_embeddings] multiplex=False
2025-08-20 12:30:33 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 12:30:33 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 12:30:33 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 12:30:35 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 12:30:36 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 12:30:37 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 12:30:37 INFO: [load_embeddings] embeddings shape: (115587, 192)
2025-08-20 12:30:37 INFO: [load_embeddings] labels shape: (115587,)
2025-08-20 12:30:37 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 12:30:37 INFO: [load_embeddings] paths shape: (115587,)
Train dataset
(92469,) (92469, 192) [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
3: 24342
11: 2182
7: 2274
9: 2098
6: 2182
24: 1890
13: 2348
12: 2167
2: 2445
14: 2098
19: 1775
25: 18081
21: 2121
10: 2453
0: 2091
4: 1891
20: 1963
15: 2004
16: 1838
22: 2027
5: 2330
17: 1681
1: 1951
8: 1897
23: 2170
18: 2170
Test dataset
(23118,) (23118, 192) [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
25: 4520
5: 583
3: 6086
23: 542
24: 473
12: 542
9: 524
19: 444
0: 523
11: 546
7: 568
4: 473
1: 488
13: 587
10: 614
22: 507
15: 501
2: 611
17: 420
6: 546
16: 459
20: 491
21: 530
14: 524
18: 542
8: 474
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       523
           1       0.97      0.95      0.96       488
           2       0.95      0.96      0.95       611
           3       1.00      1.00      1.00      6086
           4       0.90      0.91      0.91       473
           5       0.91      0.90      0.91       583
           6       0.98      0.97      0.97       546
           7       0.98      0.99      0.98       568
           8       0.98      0.97      0.97       474
           9       0.89      0.87      0.88       524
          10       0.95      0.95      0.95       614
          11       0.98      0.99      0.98       546
          12       0.99      0.99      0.99       542
          13       0.97      0.97      0.97       587
          14       0.97      0.96      0.97       524
          15       0.95      0.96      0.96       501
          16       0.89      0.89      0.89       459
          17       0.92      0.91      0.92       420
          18       0.97      0.98      0.97       542
          19       0.98      0.94      0.96       444
          20       0.91      0.92      0.92       491
          21       0.86      0.87      0.86       530
          22       0.93      0.95      0.94       507
          23       0.95      0.94      0.94       542
          24       0.97      0.99      0.98       473
          25       0.99      0.99      0.99      4520

    accuracy                           0.97     23118
   macro avg       0.95      0.95      0.95     23118
weighted avg       0.97      0.97      0.97     23118

Accuracy: 0.9695

=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.997405     0.950287     0.998495 0.935970 0.998849
        CLTC_WT_Untreated  0.998356     0.948770     0.999426 0.972689 0.998896
Calreticulin_WT_Untreated  0.997578     0.960720     0.998578 0.948304 0.998933
        DAPI_WT_Untreated  1.000000     1.000000     1.000000 1.000000 1.000000
       DCP1A_WT_Untreated  0.996107     0.911205     0.997880 0.899791 0.998145
        FMRP_WT_Untreated  0.995372     0.903945     0.997737 0.911765 0.997516
         FUS_WT_Untreated  0.998789     0.965201     0.999601 0.983209 0.999159
       G3BP1_WT_Untreated  0.999221     0.989437     0.999468 0.979094 0.999734
       GM130_WT_Untreated  0.998875     0.968354     0.999514 0.976596 0.999338
       KIF5A_WT_Untreated  0.994593     0.868321     0.997521 0.890411 0.996948
       LAMP1_WT_Untreated  0.997578     0.954397     0.998756 0.954397 0.998756
 MitoTracker_WT_Untreated  0.999221     0.987179     0.999513 0.980000 0.999690
         NCL_WT_Untreated  0.999481     0.987085     0.999779 0.990741 0.999690
        NEMO_WT_Untreated  0.998486     0.974446     0.999112 0.966216 0.999334
         P54_WT_Untreated  0.998486     0.958015     0.999425 0.974757 0.999027
       PEX14_WT_Untreated  0.998097     0.958084     0.998983 0.954274 0.999071
         PML_WT_Untreated  0.995718     0.893246     0.997793 0.891304 0.997837
       PSD95_WT_Untreated  0.996972     0.911905     0.998546 0.920673 0.998370
        PURA_WT_Untreated  0.998702     0.976015     0.999247 0.968864 0.999424
  Phalloidin_WT_Untreated  0.998573     0.943694     0.999647 0.981265 0.998898
        SNCA_WT_Untreated  0.996410     0.916497     0.998144 0.914634 0.998188
      SQSTM1_WT_Untreated  0.993771     0.869811     0.996680 0.860075 0.996944
       TDP43_WT_Untreated  0.997318     0.948718     0.998408 0.930368 0.998850
        TIA1_WT_Untreated  0.997361     0.939114     0.998760 0.947858 0.998539
      TOMM20_WT_Untreated  0.999092     0.987315     0.999338 0.968880 0.999735
        TUJ1_WT_Untreated  0.997534     0.994912     0.998172 0.992496 0.998763
            Macro Average  0.997658     0.948718     0.998789 0.949794 0.998794
In [36]:
## Baseline
run_baseline_model(
    dataset_config= dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=GaussianNB,
    classifier_kwargs={},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:25:56 INFO: [load_embeddings] multiplex=False
2025-08-20 16:25:56 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:25:56 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:25:56 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:25:59 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:26:00 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:26:00 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:26:01 INFO: [load_embeddings] embeddings shape: (115587, 192)
2025-08-20 16:26:01 INFO: [load_embeddings] labels shape: (115587,)
2025-08-20 16:26:01 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 16:26:01 INFO: [load_embeddings] paths shape: (115587,)
2025-08-20 16:26:01 INFO: [load_embeddings] multiplex=False
2025-08-20 16:26:01 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:26:01 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:26:01 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:26:03 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:26:04 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:26:04 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:26:04 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 16:26:04 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 16:26:04 INFO: [load_embeddings] example label: DCP1A_WT_Untreated
2025-08-20 16:26:04 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 16:26:04 INFO: [load_embeddings] multiplex=False
2025-08-20 16:26:04 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:26:04 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:26:04 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:26:06 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:26:07 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:26:08 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:26:08 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 16:26:08 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 16:26:08 INFO: [load_embeddings] example label: TUJ1_WT_Untreated
2025-08-20 16:26:08 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      2123
           1       0.90      0.90      0.90      2536
           2       0.77      0.80      0.78      2079
           3       1.00      1.00      1.00     24823
           4       0.81      0.69      0.74      2319
           5       0.77      0.76      0.76      2608
           6       0.93      0.89      0.91      2236
           7       0.92      0.96      0.94      2265
           8       0.97      0.93      0.95      2110
           9       0.89      0.74      0.81      2104
          10       0.88      0.89      0.88      2243
          11       0.80      0.97      0.87      2236
          12       0.97      0.93      0.95      2227
          13       0.91      0.90      0.90      2360
          14       0.93      0.78      0.85      1916
          15       0.91      0.94      0.92      2074
          16       0.84      0.78      0.81      1818
          17       0.78      0.83      0.80      1631
          18       0.84      0.95      0.89      2090
          19       0.95      0.85      0.89      2019
          20       0.70      0.90      0.79      1923
          21       0.64      0.69      0.66      1654
          22       0.77      0.89      0.83      1934
          23       0.86      0.86      0.86      2086
          24       0.94      0.95      0.94      2114
          25       0.99      0.95      0.97     18531

    accuracy                           0.92     94059
   macro avg       0.86      0.87      0.86     94059
weighted avg       0.92      0.92      0.92     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
              precision    recall  f1-score   support

           0       0.66      0.90      0.76      1850
           1       0.85      0.69      0.76      2044
           2       0.75      0.85      0.79      2332
           3       1.00      1.00      1.00     22599
           4       0.61      0.58      0.60      1901
           5       0.50      0.43      0.46      1492
           6       0.95      0.86      0.90      2095
           7       0.88      0.93      0.90      2384
           8       0.96      0.93      0.94      2145
           9       0.94      0.72      0.82      2358
          10       0.90      0.92      0.91      2340
          11       0.80      0.90      0.85      2095
          12       0.92      0.95      0.94      2085
          13       0.82      0.97      0.89      2117
          14       0.92      0.80      0.86      1751
          15       0.92      0.76      0.83      1855
          16       0.78      0.44      0.56      1623
          17       0.54      0.65      0.59      1903
          18       0.67      0.96      0.79      2085
          19       0.96      0.83      0.89      2152
          20       0.58      0.97      0.72      1857
          21       0.49      0.27      0.35      1484
          22       0.84      0.88      0.86      1836
          23       0.80      0.84      0.82      2078
          24       0.95      0.95      0.95      2200
          25       0.97      0.91      0.94     16469

    accuracy                           0.88     87130
   macro avg       0.81      0.80      0.80     87130
weighted avg       0.88      0.88      0.88     87130


=== Overall Accuracy ===
0.8964025760808889 [0.9159889005836762, 0.8768162515781017]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.990226     0.910647     0.992010 0.718713 0.997985
        CLTC_WT_Untreated  0.992290     0.803275     0.997192 0.881198 0.994910
Calreticulin_WT_Untreated  0.989304     0.826797     0.993359 0.756482 0.995668
        DAPI_WT_Untreated  0.998841     0.998313     0.999028 0.997262 0.999402
       DCP1A_WT_Untreated  0.985667     0.638863     0.993937 0.715309 0.991410
        FMRP_WT_Untreated  0.984971     0.639756     0.992964 0.677953 0.991670
         FUS_WT_Untreated  0.995739     0.879704     0.998581 0.938193 0.997059
       G3BP1_WT_Untreated  0.995850     0.942353     0.997258 0.900514 0.998480
       GM130_WT_Untreated  0.997511     0.929260     0.999152 0.963450 0.998300
       KIF5A_WT_Untreated  0.991661     0.728821     0.998297 0.915283 0.993188
       LAMP1_WT_Untreated  0.994834     0.903120     0.997214 0.893759 0.997485
 MitoTracker_WT_Untreated  0.992864     0.933964     0.994306 0.800673 0.998376
         NCL_WT_Untreated  0.997356     0.940631     0.998739 0.947885 0.998553
        NEMO_WT_Untreated  0.994724     0.932991     0.996288 0.864267 0.998299
         P54_WT_Untreated  0.994497     0.789746     0.998727 0.927611 0.995670
       PEX14_WT_Untreated  0.995027     0.854416     0.998144 0.910743 0.996778
         PML_WT_Untreated  0.990143     0.619587     0.997316 0.817171 0.992670
       PSD95_WT_Untreated  0.986738     0.733447     0.991776 0.639526 0.994682
        PURA_WT_Untreated  0.991291     0.953772     0.992176 0.741941 0.998902
  Phalloidin_WT_Untreated  0.995347     0.840086     0.999006 0.952174 0.996242
        SNCA_WT_Untreated  0.987196     0.934656     0.988315 0.630218 0.998593
      SQSTM1_WT_Untreated  0.985369     0.490121     0.994097 0.594052 0.991041
       TDP43_WT_Untreated  0.993123     0.887268     0.995373 0.802928 0.997599
        TIA1_WT_Untreated  0.992555     0.849424     0.995921 0.830477 0.996456
      TOMM20_WT_Untreated  0.997389     0.948540     0.998581 0.942206 0.998745
        TUJ1_WT_Untreated  0.983790     0.932200     0.996142 0.983007 0.983966
            Macro Average  0.992089     0.840068     0.995919 0.836269 0.995851
Out[36]:
{'Accuracy': 0.9920885840836833,
 'Sensitivity': 0.8400676286360618,
 'Specificity': 0.9959191889847439,
 'PPV': 0.8362690088380957,
 'NPV': 0.9958511944497294}
In [37]:
## Baseline
run_baseline_model(
    dataset_config= dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=RidgeClassifier,
    classifier_kwargs={},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:26:20 INFO: [load_embeddings] multiplex=False
2025-08-20 16:26:20 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:26:20 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:26:20 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:26:23 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:26:24 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:26:25 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:26:25 INFO: [load_embeddings] embeddings shape: (115587, 192)
2025-08-20 16:26:25 INFO: [load_embeddings] labels shape: (115587,)
2025-08-20 16:26:25 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 16:26:25 INFO: [load_embeddings] paths shape: (115587,)
2025-08-20 16:26:25 INFO: [load_embeddings] multiplex=False
2025-08-20 16:26:25 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:26:25 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:26:25 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:26:27 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:26:28 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:26:28 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:26:29 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 16:26:29 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 16:26:29 INFO: [load_embeddings] example label: DCP1A_WT_Untreated
2025-08-20 16:26:29 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 16:26:29 INFO: [load_embeddings] multiplex=False
2025-08-20 16:26:29 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:26:29 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:26:29 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:26:31 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:26:31 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:26:32 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:26:32 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 16:26:32 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 16:26:32 INFO: [load_embeddings] example label: TUJ1_WT_Untreated
2025-08-20 16:26:32 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      2123
           1       0.98      0.77      0.87      2536
           2       0.79      0.85      0.82      2079
           3       1.00      1.00      1.00     24823
           4       0.88      0.74      0.80      2319
           5       0.79      0.86      0.82      2608
           6       0.95      0.90      0.93      2236
           7       0.96      0.96      0.96      2265
           8       0.96      0.95      0.96      2110
           9       0.88      0.82      0.85      2104
          10       0.91      0.89      0.90      2243
          11       0.85      0.97      0.91      2236
          12       0.98      0.94      0.96      2227
          13       0.89      0.93      0.91      2360
          14       0.96      0.88      0.92      1916
          15       0.92      0.97      0.94      2074
          16       0.93      0.76      0.84      1818
          17       0.89      0.77      0.82      1631
          18       0.89      0.97      0.93      2090
          19       0.97      0.71      0.82      2019
          20       0.85      0.91      0.88      1923
          21       0.77      0.69      0.73      1654
          22       0.83      0.92      0.87      1934
          23       0.92      0.90      0.91      2086
          24       0.94      0.94      0.94      2114
          25       0.94      0.99      0.97     18531

    accuracy                           0.93     94059
   macro avg       0.90      0.88      0.89     94059
weighted avg       0.93      0.93      0.93     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
              precision    recall  f1-score   support

           0       0.81      0.89      0.85      1850
           1       0.95      0.19      0.31      2044
           2       0.87      0.90      0.88      2332
           3       0.99      1.00      0.99     22599
           4       0.65      0.61      0.63      1901
           5       0.49      0.46      0.48      1492
           6       0.95      0.83      0.89      2095
           7       0.94      0.93      0.93      2384
           8       0.94      0.94      0.94      2145
           9       0.94      0.77      0.84      2358
          10       0.93      0.92      0.92      2340
          11       0.88      0.96      0.91      2095
          12       0.96      0.96      0.96      2085
          13       0.76      0.99      0.86      2117
          14       0.92      0.87      0.90      1751
          15       0.94      0.91      0.93      1855
          16       0.87      0.45      0.59      1623
          17       0.56      0.61      0.58      1903
          18       0.77      0.98      0.86      2085
          19       0.97      0.75      0.85      2152
          20       0.65      0.89      0.75      1857
          21       0.66      0.27      0.38      1484
          22       0.87      0.91      0.89      1836
          23       0.84      0.88      0.86      2078
          24       0.94      0.95      0.95      2200
          25       0.88      0.98      0.93     16469

    accuracy                           0.89     87130
   macro avg       0.84      0.80      0.80     87130
weighted avg       0.89      0.89      0.88     87130


=== Overall Accuracy ===
0.9085961175275523 [0.9313090719654685, 0.8858831630896362]
Exception ignored in: <cyfunction RandomForestClassifier.__del__ at 0x14bc241b8a00>
Traceback (most recent call last):
  File "randomforestclassifier.pyx", line 317, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__
  File "randomforestclassifier.pyx", line 321, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data
  File "base.pyx", line 330, in cuml.internals.base.Base.__getattr__
AttributeError: rf_forest
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.993614     0.903851     0.995627 0.822492 0.997840
        CLTC_WT_Untreated  0.987345     0.511354     0.999689 0.977055 0.987483
Calreticulin_WT_Untreated  0.992671     0.876672     0.995565 0.831434 0.996919
        DAPI_WT_Untreated  0.997577     1.000000     0.996718 0.990828 1.000000
       DCP1A_WT_Untreated  0.987913     0.684597     0.995146 0.770811 0.992499
        FMRP_WT_Untreated  0.986219     0.717317     0.992444 0.687310 0.993449
         FUS_WT_Untreated  0.995783     0.865851     0.998965 0.953471 0.996722
       G3BP1_WT_Untreated  0.997279     0.944504     0.998669 0.949200 0.998539
       GM130_WT_Untreated  0.997687     0.947826     0.998887 0.953428 0.998745
       KIF5A_WT_Untreated  0.992886     0.789556     0.998020 0.909631 0.994704
       LAMP1_WT_Untreated  0.995601     0.906830     0.997905 0.918250 0.997583
 MitoTracker_WT_Untreated  0.995563     0.965828     0.996291 0.864435 0.999161
         NCL_WT_Untreated  0.998162     0.950603     0.999322 0.971557 0.998796
        NEMO_WT_Untreated  0.993808     0.958008     0.994715 0.821176 0.998932
         P54_WT_Untreated  0.996330     0.875375     0.998828 0.939146 0.997429
       PEX14_WT_Untreated  0.997207     0.941206     0.998449 0.930783 0.998697
         PML_WT_Untreated  0.991517     0.614066     0.998824 0.909991 0.992575
       PSD95_WT_Untreated  0.987875     0.680249     0.993994 0.692596 0.993642
        PURA_WT_Untreated  0.994624     0.977006     0.995040 0.822877 0.999455
  Phalloidin_WT_Untreated  0.993394     0.734836     0.999486 0.971166 0.993788
        SNCA_WT_Untreated  0.991241     0.898942     0.993208 0.738214 0.997837
      SQSTM1_WT_Untreated  0.988150     0.491077     0.996911 0.736968 0.991083
       TDP43_WT_Untreated  0.994801     0.916711     0.996460 0.846229 0.998227
        TIA1_WT_Untreated  0.994525     0.888569     0.997017 0.875118 0.997378
      TOMM20_WT_Untreated  0.997373     0.945526     0.998637 0.944213 0.998671
        TUJ1_WT_Untreated  0.979784     0.987600     0.977912 0.914565 0.996973
            Macro Average  0.993036     0.845152     0.996259 0.874729 0.996428
Out[37]:
{'Accuracy': 0.9930357463541044,
 'Sensitivity': 0.8451522565708357,
 'Specificity': 0.9962587823367731,
 'PPV': 0.8747285842320065,
 'NPV': 0.9964279364711918}
In [38]:
## Baseline
run_baseline_model(
    dataset_config= dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=LinearSVC,
    classifier_kwargs={"C": 1.0, "max_iter": 1000, "random_state": 42},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:26:37 INFO: [load_embeddings] multiplex=False
2025-08-20 16:26:37 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:26:37 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:26:37 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:26:39 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:26:40 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:26:41 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:26:41 INFO: [load_embeddings] embeddings shape: (115587, 192)
2025-08-20 16:26:41 INFO: [load_embeddings] labels shape: (115587,)
2025-08-20 16:26:41 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 16:26:41 INFO: [load_embeddings] paths shape: (115587,)
2025-08-20 16:26:41 INFO: [load_embeddings] multiplex=False
2025-08-20 16:26:41 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:26:41 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:26:41 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:26:43 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:26:44 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:26:45 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:26:45 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 16:26:45 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 16:26:45 INFO: [load_embeddings] example label: DCP1A_WT_Untreated
2025-08-20 16:26:45 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 16:26:45 INFO: [load_embeddings] multiplex=False
2025-08-20 16:26:45 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:26:45 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:26:45 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:26:47 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:26:48 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:26:48 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:26:48 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 16:26:48 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 16:26:48 INFO: [load_embeddings] example label: TUJ1_WT_Untreated
2025-08-20 16:26:48 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      2123
           1       0.99      0.97      0.98      2536
           2       0.96      0.96      0.96      2079
           3       1.00      1.00      1.00     24823
           4       0.92      0.86      0.89      2319
           5       0.89      0.91      0.90      2608
           6       0.99      0.94      0.96      2236
           7       0.98      0.98      0.98      2265
           8       0.98      0.98      0.98      2110
           9       0.94      0.88      0.91      2104
          10       0.95      0.94      0.95      2243
          11       0.97      0.99      0.98      2236
          12       0.99      1.00      0.99      2227
          13       0.97      0.94      0.95      2360
          14       0.96      0.95      0.96      1916
          15       0.96      0.97      0.97      2074
          16       0.93      0.88      0.91      1818
          17       0.87      0.89      0.88      1631
          18       0.95      0.98      0.97      2090
          19       0.99      0.91      0.95      2019
          20       0.90      0.97      0.94      1923
          21       0.79      0.84      0.81      1654
          22       0.91      0.94      0.93      1934
          23       0.93      0.95      0.94      2086
          24       0.97      0.98      0.97      2114
          25       0.99      1.00      0.99     18531

    accuracy                           0.97     94059
   macro avg       0.95      0.95      0.95     94059
weighted avg       0.97      0.97      0.97     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      1850
           1       0.97      0.61      0.75      2044
           2       0.96      0.97      0.97      2332
           3       1.00      1.00      1.00     22599
           4       0.74      0.77      0.75      1901
           5       0.67      0.58      0.62      1492
           6       0.98      0.90      0.94      2095
           7       0.95      0.97      0.96      2384
           8       0.97      0.98      0.98      2145
           9       0.97      0.83      0.90      2358
          10       0.96      0.96      0.96      2340
          11       0.98      0.96      0.97      2095
          12       0.96      0.99      0.98      2085
          13       0.88      1.00      0.94      2117
          14       0.93      0.94      0.94      1751
          15       0.97      0.90      0.94      1855
          16       0.87      0.55      0.68      1623
          17       0.55      0.73      0.63      1903
          18       0.86      0.99      0.92      2085
          19       0.98      0.92      0.95      2152
          20       0.72      0.98      0.83      1857
          21       0.58      0.26      0.36      1484
          22       0.94      0.92      0.93      1836
          23       0.82      0.95      0.88      2078
          24       0.97      0.98      0.97      2200
          25       0.95      0.99      0.97     16469

    accuracy                           0.93     87130
   macro avg       0.89      0.87      0.87     87130
weighted avg       0.93      0.93      0.92     87130


=== Overall Accuracy ===
0.9474422945683487 [0.9680625990070062, 0.9268219901296912]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.996335     0.932796     0.997760 0.903242 0.998492
        CLTC_WT_Untreated  0.994867     0.811572     0.999621 0.982294 0.995135
Calreticulin_WT_Untreated  0.998162     0.964861     0.998993 0.959856 0.999123
        DAPI_WT_Untreated  0.999349     1.000000     0.999118 0.997518 1.000000
       DCP1A_WT_Untreated  0.991931     0.818246     0.996073 0.832449 0.995668
        FMRP_WT_Untreated  0.991324     0.792927     0.995917 0.818067 0.995209
         FUS_WT_Untreated  0.997798     0.921727     0.999661 0.985192 0.998086
       G3BP1_WT_Untreated  0.998438     0.976554     0.999014 0.963089 0.999382
       GM130_WT_Untreated  0.999023     0.980964     0.999457 0.977518 0.999542
       KIF5A_WT_Untreated  0.995452     0.855670     0.998981 0.954977 0.996366
       LAMP1_WT_Untreated  0.997715     0.952215     0.998896 0.957227 0.998760
 MitoTracker_WT_Untreated  0.998852     0.978065     0.999361 0.974017 0.999463
         NCL_WT_Untreated  0.999299     0.994666     0.999412 0.976326 0.999870
        NEMO_WT_Untreated  0.997218     0.966719     0.997991 0.924194 0.999156
         P54_WT_Untreated  0.997842     0.945460     0.998924 0.947786 0.998873
       PEX14_WT_Untreated  0.997997     0.937643     0.999334 0.968964 0.998619
         PML_WT_Untreated  0.993460     0.727695     0.998605 0.909884 0.994749
       PSD95_WT_Untreated  0.988658     0.805603     0.992300 0.675445 0.996118
        PURA_WT_Untreated  0.997285     0.987066     0.997526 0.903926 0.999694
  Phalloidin_WT_Untreated  0.997687     0.913210     0.999678 0.985256 0.997959
        SNCA_WT_Untreated  0.994464     0.973545     0.994910 0.802967 0.999434
      SQSTM1_WT_Untreated  0.988885     0.563735     0.996377 0.732809 0.992342
       TDP43_WT_Untreated  0.997003     0.932361     0.998377 0.924270 0.998562
        TIA1_WT_Untreated  0.995684     0.951249     0.996729 0.872467 0.998851
      TOMM20_WT_Untreated  0.998697     0.976356     0.999242 0.969167 0.999423
        TUJ1_WT_Untreated  0.993035     0.991257     0.993461 0.973184 0.997897
            Macro Average  0.996018     0.909699     0.997912 0.918157 0.997953
Out[38]:
{'Accuracy': 0.9960177579127958,
 'Sensitivity': 0.9096985119886605,
 'Specificity': 0.9979122458246539,
 'PPV': 0.9181573382480437,
 'NPV': 0.997952869605}
In [39]:
run_baseline_model(
    dataset_config,
    batches=[1,2,3,],
    classifier_class=cuRF,
    classifier_kwargs={"n_estimators": 300, "random_state": 42},  # max_depth=0 => unlimited in cuML
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:27:33 INFO: [load_embeddings] multiplex=False
2025-08-20 16:27:33 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:27:33 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:27:33 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:27:36 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:27:37 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:27:37 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:27:38 INFO: [load_embeddings] embeddings shape: (115587, 192)
2025-08-20 16:27:38 INFO: [load_embeddings] labels shape: (115587,)
2025-08-20 16:27:38 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 16:27:38 INFO: [load_embeddings] paths shape: (115587,)
2025-08-20 16:27:38 INFO: [load_embeddings] multiplex=False
2025-08-20 16:27:38 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:27:38 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:27:38 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:27:40 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:27:41 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:27:41 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:27:42 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 16:27:42 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 16:27:42 INFO: [load_embeddings] example label: DCP1A_WT_Untreated
2025-08-20 16:27:42 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 16:27:42 INFO: [load_embeddings] multiplex=False
2025-08-20 16:27:42 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:27:42 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:27:42 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:27:44 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:27:45 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:27:45 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:27:46 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 16:27:46 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 16:27:46 INFO: [load_embeddings] example label: TUJ1_WT_Untreated
2025-08-20 16:27:46 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return func(**kwargs)
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      2123
           1       0.98      0.91      0.95      2536
           2       0.92      0.90      0.91      2079
           3       1.00      1.00      1.00     24823
           4       0.85      0.77      0.81      2319
           5       0.80      0.86      0.83      2608
           6       0.98      0.91      0.95      2236
           7       0.97      0.97      0.97      2265
           8       0.98      0.97      0.98      2110
           9       0.93      0.82      0.87      2104
          10       0.91      0.93      0.92      2243
          11       0.92      0.99      0.95      2236
          12       0.98      0.99      0.98      2227
          13       0.95      0.93      0.94      2360
          14       0.95      0.89      0.92      1916
          15       0.94      0.95      0.94      2074
          16       0.93      0.84      0.88      1818
          17       0.86      0.86      0.86      1631
          18       0.92      0.98      0.95      2090
          19       0.99      0.83      0.90      2019
          20       0.82      0.94      0.87      1923
          21       0.74      0.78      0.76      1654
          22       0.85      0.92      0.88      1934
          23       0.91      0.92      0.92      2086
          24       0.97      0.96      0.96      2114
          25       0.98      0.99      0.99     18531

    accuracy                           0.95     94059
   macro avg       0.92      0.91      0.91     94059
weighted avg       0.95      0.95      0.95     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return func(**kwargs)
              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1850
           1       0.96      0.43      0.59      2044
           2       0.91      0.94      0.93      2332
           3       0.99      1.00      1.00     22599
           4       0.66      0.68      0.67      1901
           5       0.54      0.51      0.52      1492
           6       0.98      0.87      0.92      2095
           7       0.94      0.93      0.94      2384
           8       0.97      0.97      0.97      2145
           9       0.95      0.77      0.85      2358
          10       0.94      0.95      0.95      2340
          11       0.94      0.95      0.94      2095
          12       0.94      0.99      0.97      2085
          13       0.84      0.99      0.91      2117
          14       0.94      0.88      0.91      1751
          15       0.97      0.87      0.91      1855
          16       0.86      0.49      0.63      1623
          17       0.56      0.68      0.61      1903
          18       0.79      0.99      0.88      2085
          19       0.99      0.84      0.91      2152
          20       0.66      0.97      0.79      1857
          21       0.62      0.29      0.39      1484
          22       0.89      0.92      0.91      1836
          23       0.82      0.91      0.86      2078
          24       0.97      0.96      0.97      2200
          25       0.92      0.98      0.95     16469

    accuracy                           0.91     87130
   macro avg       0.86      0.83      0.84     87130
weighted avg       0.91      0.91      0.90     87130


=== Overall Accuracy ===
0.9268782814899591 [0.9486492520651931, 0.9051073109147251]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.994591     0.914422     0.996389 0.850222 0.998078
        CLTC_WT_Untreated  0.991953     0.697380     0.999592 0.977955 0.992210
Calreticulin_WT_Untreated  0.995960     0.921333     0.997822 0.913464 0.998037
        DAPI_WT_Untreated  0.999073     0.999979     0.998752 0.996491 0.999993
       DCP1A_WT_Untreated  0.988244     0.731043     0.994378 0.756127 0.993592
        FMRP_WT_Untreated  0.987218     0.728780     0.993201 0.712786 0.993717
         FUS_WT_Untreated  0.997003     0.892173     0.999570 0.980711 0.997365
       G3BP1_WT_Untreated  0.997511     0.950097     0.998759 0.952761 0.998686
       GM130_WT_Untreated  0.998653     0.968508     0.999378 0.974001 0.999243
       KIF5A_WT_Untreated  0.993675     0.791573     0.998778 0.942369 0.994759
       LAMP1_WT_Untreated  0.996584     0.941523     0.998013 0.924775 0.998482
 MitoTracker_WT_Untreated  0.997434     0.966290     0.998196 0.929174 0.999174
         NCL_WT_Untreated  0.998786     0.988173     0.999045 0.961851 0.999711
        NEMO_WT_Untreated  0.996131     0.957338     0.997114 0.893661 0.998917
         P54_WT_Untreated  0.996655     0.887647     0.998907 0.943752 0.997682
       PEX14_WT_Untreated  0.997064     0.910664     0.998979 0.951849 0.998022
         PML_WT_Untreated  0.992527     0.675385     0.998667 0.907458 0.993747
       PSD95_WT_Untreated  0.988559     0.758630     0.993133 0.687260 0.995189
        PURA_WT_Untreated  0.995590     0.982994     0.995887 0.849338 0.999597
  Phalloidin_WT_Untreated  0.995982     0.837209     0.999723 0.986162 0.996178
        SNCA_WT_Untreated  0.991821     0.956085     0.992582 0.733063 0.999058
      SQSTM1_WT_Untreated  0.988222     0.546845     0.996001 0.706755 0.992045
       TDP43_WT_Untreated  0.995447     0.920955     0.997030 0.868217 0.998318
        TIA1_WT_Untreated  0.994812     0.915466     0.996678 0.866364 0.998009
      TOMM20_WT_Untreated  0.998273     0.958507     0.999242 0.968611 0.998988
        TUJ1_WT_Untreated  0.987654     0.986914     0.987831 0.951020 0.996839
            Macro Average  0.994439     0.876381     0.997063 0.891777 0.997140
Out[39]:
{'Accuracy': 0.9944392956441148,
 'Sensitivity': 0.8763812358081148,
 'Specificity': 0.9970633461347483,
 'PPV': 0.8917767791645945,
 'NPV': 0.9971398095115489}
In [40]:
run_baseline_model(dataset_config,
    batches=[1,2,3],
    classifier_class=ExtraTreesClassifier,
    classifier_kwargs={"n_estimators": 300, "max_depth": None, "min_samples_leaf": 1,
                                     "n_jobs": -1, "random_state": 42},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:28:08 INFO: [load_embeddings] multiplex=False
2025-08-20 16:28:08 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:28:08 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:28:08 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:28:10 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:28:11 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:28:12 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:28:13 INFO: [load_embeddings] embeddings shape: (115587, 192)
2025-08-20 16:28:13 INFO: [load_embeddings] labels shape: (115587,)
2025-08-20 16:28:13 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 16:28:13 INFO: [load_embeddings] paths shape: (115587,)
2025-08-20 16:28:13 INFO: [load_embeddings] multiplex=False
2025-08-20 16:28:13 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:28:13 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:28:13 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:28:15 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:28:15 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:28:16 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:28:16 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 16:28:16 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 16:28:16 INFO: [load_embeddings] example label: DCP1A_WT_Untreated
2025-08-20 16:28:16 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 16:28:16 INFO: [load_embeddings] multiplex=False
2025-08-20 16:28:16 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:28:16 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:28:16 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
2025-08-20 16:28:18 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:28:19 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:28:19 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:28:20 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 16:28:20 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 16:28:20 INFO: [load_embeddings] example label: TUJ1_WT_Untreated
2025-08-20 16:28:20 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      2123
           1       0.99      0.92      0.95      2536
           2       0.91      0.92      0.92      2079
           3       1.00      1.00      1.00     24823
           4       0.88      0.81      0.84      2319
           5       0.84      0.87      0.86      2608
           6       0.98      0.93      0.95      2236
           7       0.97      0.97      0.97      2265
           8       0.98      0.98      0.98      2110
           9       0.93      0.86      0.89      2104
          10       0.92      0.93      0.92      2243
          11       0.93      0.99      0.96      2236
          12       0.98      0.99      0.98      2227
          13       0.95      0.94      0.94      2360
          14       0.96      0.91      0.93      1916
          15       0.94      0.96      0.95      2074
          16       0.94      0.86      0.90      1818
          17       0.87      0.87      0.87      1631
          18       0.92      0.98      0.95      2090
          19       0.99      0.83      0.90      2019
          20       0.85      0.95      0.90      1923
          21       0.78      0.79      0.79      1654
          22       0.87      0.93      0.90      1934
          23       0.93      0.93      0.93      2086
          24       0.96      0.96      0.96      2114
          25       0.98      0.99      0.99     18531

    accuracy                           0.95     94059
   macro avg       0.93      0.92      0.93     94059
weighted avg       0.96      0.95      0.95     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115587, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
              precision    recall  f1-score   support

           0       0.85      0.92      0.88      1850
           1       0.97      0.42      0.58      2044
           2       0.91      0.96      0.93      2332
           3       0.99      1.00      1.00     22599
           4       0.68      0.72      0.70      1901
           5       0.61      0.54      0.57      1492
           6       0.98      0.88      0.93      2095
           7       0.94      0.94      0.94      2384
           8       0.97      0.97      0.97      2145
           9       0.97      0.82      0.89      2358
          10       0.95      0.95      0.95      2340
          11       0.94      0.95      0.95      2095
          12       0.95      0.99      0.97      2085
          13       0.85      0.99      0.91      2117
          14       0.95      0.90      0.92      1751
          15       0.96      0.88      0.92      1855
          16       0.88      0.52      0.65      1623
          17       0.57      0.68      0.62      1903
          18       0.80      0.99      0.89      2085
          19       0.99      0.84      0.91      2152
          20       0.68      0.97      0.80      1857
          21       0.68      0.29      0.41      1484
          22       0.91      0.93      0.92      1836
          23       0.85      0.91      0.88      2078
          24       0.97      0.97      0.97      2200
          25       0.92      0.98      0.95     16469

    accuracy                           0.91     87130
   macro avg       0.87      0.84      0.85     87130
weighted avg       0.91      0.91      0.91     87130


=== Overall Accuracy ===
0.9330128509768199 [0.9547092782189902, 0.9113164237346494]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.995535     0.931034     0.996981 0.873642 0.998452
        CLTC_WT_Untreated  0.991959     0.694105     0.999683 0.982689 0.992127
Calreticulin_WT_Untreated  0.996330     0.938336     0.997777 0.913283 0.998460
        DAPI_WT_Untreated  0.999073     0.999979     0.998752 0.996491 0.999993
       DCP1A_WT_Untreated  0.989696     0.768483     0.994971 0.784660 0.994482
        FMRP_WT_Untreated  0.989149     0.751707     0.994647 0.764764 0.994254
         FUS_WT_Untreated  0.997318     0.904179     0.999599 0.982192 0.997658
       G3BP1_WT_Untreated  0.997699     0.956765     0.998776 0.953688 0.998861
       GM130_WT_Untreated  0.998758     0.974618     0.999339 0.972561 0.999390
       KIF5A_WT_Untreated  0.994840     0.834827     0.998880 0.949528 0.995842
       LAMP1_WT_Untreated  0.996777     0.939996     0.998250 0.933073 0.998443
 MitoTracker_WT_Untreated  0.997710     0.969060     0.998411 0.937249 0.999242
         NCL_WT_Untreated  0.998935     0.989100     0.999175 0.966901 0.999734
        NEMO_WT_Untreated  0.996402     0.965602     0.997182 0.896702 0.999127
         P54_WT_Untreated  0.997158     0.904281     0.999076 0.952874 0.998025
       PEX14_WT_Untreated  0.997235     0.919827     0.998951 0.951053 0.998224
         PML_WT_Untreated  0.993140     0.697472     0.998864 0.922367 0.994171
       PSD95_WT_Untreated  0.988835     0.769949     0.993189 0.692190 0.995413
        PURA_WT_Untreated  0.995943     0.985150     0.996198 0.859382 0.999649
  Phalloidin_WT_Untreated  0.995993     0.835291     0.999780 0.988930 0.996133
        SNCA_WT_Untreated  0.992649     0.960317     0.993337 0.754364 0.999150
      SQSTM1_WT_Untreated  0.989144     0.555449     0.996787 0.752916 0.992201
       TDP43_WT_Untreated  0.996120     0.932626     0.997469 0.886759 0.998567
        TIA1_WT_Untreated  0.995414     0.920029     0.997187 0.884962 0.998117
      TOMM20_WT_Untreated  0.998333     0.962911     0.999197 0.966946 0.999095
        TUJ1_WT_Untreated  0.987543     0.988229     0.987379 0.949359 0.997154
            Macro Average  0.994911     0.886512     0.997301 0.902674 0.997383
Out[40]:
{'Accuracy': 0.9949109663220342,
 'Sensitivity': 0.8865124718026224,
 'Specificity': 0.9973013872850368,
 'PPV': 0.9026740505806359,
 'NPV': 0.9973831966631587}

Cytoself¶

In [41]:
Cytoself_dataset_config = {
    "path_to_embeddings": "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/",
    "multiplexed": False,
    "config_fmt": "NIH_UMAP1_DatasetConfig_B{batch}",
    "config_dir": "manuscript/manuscript_figures_data_config",
}
In [42]:
## Baseline
run_baseline_model(
    dataset_config= Cytoself_dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:32:51 INFO: [load_embeddings] multiplex=False
2025-08-20 16:32:51 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:32:51 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:32:51 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:32:58 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:33:01 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:33:02 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:33:03 INFO: [load_embeddings] embeddings shape: (112878, 2048)
2025-08-20 16:33:03 INFO: [load_embeddings] labels shape: (112878,)
2025-08-20 16:33:03 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:33:03 INFO: [load_embeddings] paths shape: (112878,)
2025-08-20 16:33:03 INFO: [load_embeddings] multiplex=False
2025-08-20 16:33:03 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:33:03 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:33:03 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:33:09 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:33:11 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:33:13 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:33:13 INFO: [load_embeddings] embeddings shape: (91973, 2048)
2025-08-20 16:33:13 INFO: [load_embeddings] labels shape: (91973,)
2025-08-20 16:33:13 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:33:13 INFO: [load_embeddings] paths shape: (91973,)
2025-08-20 16:33:13 INFO: [load_embeddings] multiplex=False
2025-08-20 16:33:13 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:33:13 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:33:13 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:33:19 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:33:21 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:33:23 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:33:23 INFO: [load_embeddings] embeddings shape: (85052, 2048)
2025-08-20 16:33:23 INFO: [load_embeddings] labels shape: (85052,)
2025-08-20 16:33:23 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:33:23 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (91973, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
[W] [16:33:37.459943] L-BFGS: max iterations reached
[W] [16:33:37.464982] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2123
           1       0.98      0.97      0.97      2536
           2       0.98      0.96      0.97      2079
           3       1.00      1.00      1.00     24823
           4       0.87      0.78      0.82      2319
           5       0.81      0.85      0.83      2608
           6       0.96      0.94      0.95      2236
           7       0.97      0.98      0.97      2265
           8       0.94      0.95      0.94      2110
           9       0.86      0.82      0.84      2104
          10       0.95      0.97      0.96      2243
          11       0.98      0.99      0.99      2236
          12       0.99      0.99      0.99      2227
          13       0.96      0.88      0.92      2360
          14       0.94      0.94      0.94      1916
          15       0.98      0.97      0.97      2074
          16       0.85      0.83      0.84      1818
          17       0.81      0.84      0.82      1631
          18       0.96      0.99      0.97      2090
          19       0.98      0.92      0.95      2019
          20       0.85      0.95      0.90      1923
          21       0.67      0.74      0.70      1654
          22       0.92      0.92      0.92      1934
          23       0.97      0.96      0.97      2114
          24       0.99      1.00      1.00     18531

    accuracy                           0.96     91973
   macro avg       0.92      0.92      0.92     91973
weighted avg       0.96      0.96      0.96     91973

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (85052, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
[W] [16:33:51.947469] L-BFGS: max iterations reached
[W] [16:33:51.948901] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
              precision    recall  f1-score   support

           0       0.81      0.95      0.87      1850
           1       0.97      0.64      0.77      2044
           2       0.97      0.95      0.96      2332
           3       1.00      1.00      1.00     22599
           4       0.73      0.65      0.69      1901
           5       0.54      0.64      0.58      1492
           6       0.94      0.89      0.92      2095
           7       0.89      0.95      0.92      2384
           8       0.92      0.94      0.93      2145
           9       0.91      0.70      0.79      2358
          10       0.88      0.95      0.92      2340
          11       0.97      0.94      0.95      2095
          12       0.95      0.99      0.97      2085
          13       0.88      0.97      0.92      2117
          14       0.93      0.92      0.92      1751
          15       0.95      0.75      0.84      1855
          16       0.74      0.49      0.59      1623
          17       0.52      0.61      0.56      1903
          18       0.91      0.98      0.94      2085
          19       0.97      0.91      0.94      2152
          20       0.65      0.94      0.77      1857
          21       0.46      0.30      0.37      1484
          22       0.91      0.92      0.91      1836
          23       0.95      0.97      0.96      2200
          24       0.96      0.99      0.98     16469

    accuracy                           0.91     85052
   macro avg       0.85      0.84      0.84     85052
weighted avg       0.91      0.91      0.91     85052


=== Overall Accuracy ===
0.9338983849888869 [0.9565307209724593, 0.9112660490053144]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.995899     0.955701     0.996822 0.873476 0.998981
        CLTC_WT_Untreated  0.994826     0.820524     0.999455 0.975597 0.995253
Calreticulin_WT_Untreated  0.998260     0.955112     0.999363 0.974555 0.998854
        DAPI_WT_Untreated  0.999486     0.999831     0.999360 0.998253 0.999938
       DCP1A_WT_Untreated  0.989295     0.722275     0.995816 0.808274 0.993235
        FMRP_WT_Untreated  0.987137     0.773902     0.992193 0.701526 0.994626
         FUS_WT_Untreated  0.996876     0.918033     0.998853 0.952563 0.997946
       G3BP1_WT_Untreated  0.997080     0.965369     0.997935 0.926507 0.999065
       GM130_WT_Untreated  0.996983     0.943596     0.998298 0.931771 0.998610
       KIF5A_WT_Untreated  0.991408     0.759077     0.997415 0.883642 0.993793
       LAMP1_WT_Untreated  0.996667     0.960506     0.997628 0.914986 0.998949
 MitoTracker_WT_Untreated  0.998514     0.963519     0.999392 0.975456 0.999085
         NCL_WT_Untreated  0.998978     0.989796     0.999207 0.968899 0.999745
        NEMO_WT_Untreated  0.995961     0.919366     0.997948 0.920805 0.997908
         P54_WT_Untreated  0.997096     0.927734     0.998564 0.931800 0.998472
       PEX14_WT_Untreated  0.996419     0.868160     0.999330 0.967111 0.997014
         PML_WT_Untreated  0.990527     0.670445     0.996872 0.809474 0.993489
       PSD95_WT_Untreated  0.986471     0.714771     0.992005 0.645541 0.994177
        PURA_WT_Untreated  0.998012     0.985389     0.998316 0.933939 0.999647
  Phalloidin_WT_Untreated  0.997441     0.913450     0.999468 0.976422 0.997915
        SNCA_WT_Untreated  0.991719     0.942593     0.992791 0.740441 0.998740
      SQSTM1_WT_Untreated  0.985234     0.533142     0.993392 0.592842 0.991590
       TDP43_WT_Untreated  0.996424     0.918037     0.998130 0.914399 0.998216
      TOMM20_WT_Untreated  0.998192     0.964070     0.999045 0.961841 0.999102
        TUJ1_WT_Untreated  0.994662     0.995029     0.994571 0.978341 0.998770
            Macro Average  0.994783     0.883177     0.997287 0.890338 0.997325
Out[42]:
{'Accuracy': 0.9947826578166924,
 'Sensitivity': 0.8831770315317321,
 'Specificity': 0.9972867688807299,
 'PPV': 0.8903383300699761,
 'NPV': 0.9973248723561033}
In [6]:
## Baseline
run_baseline_model(
    dataset_config= Cytoself_dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
2025-08-20 11:53:48 INFO: [load_embeddings] multiplex=False
2025-08-20 11:53:48 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 11:53:48 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 11:53:48 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 11:53:56 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 11:53:58 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 11:54:00 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 11:54:01 INFO: [load_embeddings] embeddings shape: (112878, 2048)
2025-08-20 11:54:01 INFO: [load_embeddings] labels shape: (112878,)
2025-08-20 11:54:01 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 11:54:01 INFO: [load_embeddings] paths shape: (112878,)
2025-08-20 11:54:01 INFO: [load_embeddings] multiplex=False
2025-08-20 11:54:01 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 11:54:01 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 11:54:01 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 11:54:07 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 11:54:10 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 11:54:11 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 11:54:12 INFO: [load_embeddings] embeddings shape: (91973, 2048)
2025-08-20 11:54:12 INFO: [load_embeddings] labels shape: (91973,)
2025-08-20 11:54:12 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 11:54:12 INFO: [load_embeddings] paths shape: (91973,)
2025-08-20 11:54:12 INFO: [load_embeddings] multiplex=False
2025-08-20 11:54:12 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 11:54:12 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 11:54:12 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 11:54:18 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 11:54:21 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 11:54:22 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 11:54:23 INFO: [load_embeddings] embeddings shape: (85052, 2048)
2025-08-20 11:54:23 INFO: [load_embeddings] labels shape: (85052,)
2025-08-20 11:54:23 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 11:54:23 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [2, 3], Testing on: [1].

=== Batch [1] ===
Train: (177025, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 3973
CLTC_WT_Untreated: 4580
Calreticulin_WT_Untreated: 4411
DAPI_WT_Untreated: 47422
DCP1A_WT_Untreated: 4220
FMRP_WT_Untreated: 4100
FUS_WT_Untreated: 4331
G3BP1_WT_Untreated: 4649
GM130_WT_Untreated: 4255
KIF5A_WT_Untreated: 4462
LAMP1_WT_Untreated: 4583
MitoTracker_WT_Untreated: 4331
NCL_WT_Untreated: 4312
NEMO_WT_Untreated: 4477
P54_WT_Untreated: 3667
PEX14_WT_Untreated: 3929
PML_WT_Untreated: 3441
PSD95_WT_Untreated: 3534
PURA_WT_Untreated: 4175
Phalloidin_WT_Untreated: 4171
SNCA_WT_Untreated: 3780
SQSTM1_WT_Untreated: 3138
TDP43_WT_Untreated: 3770
TOMM20_WT_Untreated: 4314
TUJ1_WT_Untreated: 35000
[W] [11:54:49.349903] L-BFGS: max iterations reached
[W] [11:54:49.350944] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      2614
           1       0.94      0.98      0.96      2439
           2       0.99      0.98      0.98      3056
           3       1.00      1.00      1.00     30429
           4       0.75      0.85      0.80      2364
           5       0.83      0.76      0.79      2913
           6       0.97      0.97      0.97      2728
           7       0.98      0.98      0.98      2842
           8       0.93      0.96      0.95      2371
           9       0.76      0.85      0.80      2622
          10       0.97      0.96      0.97      3067
          11       0.99      0.99      0.99      2728
          12       1.00      0.99      0.99      2709
          13       0.94      0.96      0.95      2935
          14       0.94      0.96      0.95      2623
          15       0.96      0.98      0.97      2505
          16       0.80      0.85      0.83      2297
          17       0.73      0.68      0.71      2101
          18       0.99      0.97      0.98      2712
          19       0.92      0.97      0.95      2219
          20       0.93      0.82      0.87      2454
          21       0.68      0.61      0.65      2651
          22       0.93      0.92      0.93      2535
          23       0.97      0.99      0.98      2363
          24       1.00      0.99      1.00     22601

    accuracy                           0.95    112878
   macro avg       0.91      0.92      0.91    112878
weighted avg       0.95      0.95      0.95    112878

Training on Batches: [1, 3], Testing on: [2].

=== Batch [2] ===
Train: (197930, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (91973, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 4464
CLTC_WT_Untreated: 4483
Calreticulin_WT_Untreated: 5388
DAPI_WT_Untreated: 53028
DCP1A_WT_Untreated: 4265
FMRP_WT_Untreated: 4405
FUS_WT_Untreated: 4823
G3BP1_WT_Untreated: 5226
GM130_WT_Untreated: 4516
KIF5A_WT_Untreated: 4980
LAMP1_WT_Untreated: 5407
MitoTracker_WT_Untreated: 4823
NCL_WT_Untreated: 4794
NEMO_WT_Untreated: 5052
P54_WT_Untreated: 4374
PEX14_WT_Untreated: 4360
PML_WT_Untreated: 3920
PSD95_WT_Untreated: 4004
PURA_WT_Untreated: 4797
Phalloidin_WT_Untreated: 4371
SNCA_WT_Untreated: 4311
SQSTM1_WT_Untreated: 4135
TDP43_WT_Untreated: 4371
TOMM20_WT_Untreated: 4563
TUJ1_WT_Untreated: 39070
[W] [11:55:11.191966] L-BFGS: max iterations reached
[W] [11:55:11.195505] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      2123
           1       0.98      0.97      0.97      2536
           2       0.98      0.97      0.98      2079
           3       1.00      1.00      1.00     24823
           4       0.80      0.79      0.80      2319
           5       0.79      0.77      0.78      2608
           6       0.96      0.94      0.95      2236
           7       0.98      0.98      0.98      2265
           8       0.94      0.94      0.94      2110
           9       0.84      0.86      0.85      2104
          10       0.96      0.97      0.96      2243
          11       0.98      0.99      0.99      2236
          12       0.99      0.98      0.99      2227
          13       0.97      0.82      0.88      2360
          14       0.94      0.95      0.94      1916
          15       0.95      0.97      0.96      2074
          16       0.79      0.82      0.80      1818
          17       0.71      0.71      0.71      1631
          18       0.98      0.98      0.98      2090
          19       0.97      0.95      0.96      2019
          20       0.86      0.88      0.87      1923
          21       0.60      0.72      0.65      1654
          22       0.92      0.92      0.92      1934
          23       0.98      0.97      0.97      2114
          24       0.99      1.00      1.00     18531

    accuracy                           0.95     91973
   macro avg       0.91      0.91      0.91     91973
weighted avg       0.95      0.95      0.95     91973

Training on Batches: [1, 2], Testing on: [3].

=== Batch [3] ===
Train: (204851, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (85052, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 4737
CLTC_WT_Untreated: 4975
Calreticulin_WT_Untreated: 5135
DAPI_WT_Untreated: 55252
DCP1A_WT_Untreated: 4683
FMRP_WT_Untreated: 5521
FUS_WT_Untreated: 4964
G3BP1_WT_Untreated: 5107
GM130_WT_Untreated: 4481
KIF5A_WT_Untreated: 4726
LAMP1_WT_Untreated: 5310
MitoTracker_WT_Untreated: 4964
NCL_WT_Untreated: 4936
NEMO_WT_Untreated: 5295
P54_WT_Untreated: 4539
PEX14_WT_Untreated: 4579
PML_WT_Untreated: 4115
PSD95_WT_Untreated: 3732
PURA_WT_Untreated: 4802
Phalloidin_WT_Untreated: 4238
SNCA_WT_Untreated: 4377
SQSTM1_WT_Untreated: 4305
TDP43_WT_Untreated: 4469
TOMM20_WT_Untreated: 4477
TUJ1_WT_Untreated: 41132
[W] [11:55:34.194890] L-BFGS: max iterations reached
[W] [11:55:34.200548] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      1850
           1       0.97      0.70      0.82      2044
           2       0.97      0.98      0.98      2332
           3       1.00      1.00      1.00     22599
           4       0.71      0.67      0.69      1901
           5       0.54      0.58      0.56      1492
           6       0.93      0.93      0.93      2095
           7       0.92      0.94      0.93      2384
           8       0.93      0.94      0.94      2145
           9       0.93      0.78      0.85      2358
          10       0.89      0.96      0.92      2340
          11       0.96      0.95      0.96      2095
          12       0.96      0.99      0.98      2085
          13       0.82      0.97      0.89      2117
          14       0.93      0.91      0.92      1751
          15       0.96      0.76      0.85      1855
          16       0.71      0.52      0.60      1623
          17       0.52      0.60      0.56      1903
          18       0.94      0.99      0.96      2085
          19       0.97      0.96      0.96      2152
          20       0.70      0.92      0.80      1857
          21       0.51      0.30      0.38      1484
          22       0.90      0.92      0.91      1836
          23       0.96      0.97      0.97      2200
          24       0.97      0.99      0.98     16469

    accuracy                           0.92     85052
   macro avg       0.86      0.85      0.85     85052
weighted avg       0.92      0.92      0.92     85052


=== Overall Accuracy ===
0.939875249571192 [0.9517886567798862, 0.9498222304371935, 0.9180148614964962]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.996899     0.943525     0.998140 0.921833 0.998686
        CLTC_WT_Untreated  0.996602     0.893860     0.999152 0.963156 0.997371
Calreticulin_WT_Untreated  0.998951     0.978706     0.999487 0.980545 0.999437
        DAPI_WT_Untreated  0.999727     0.999576     0.999783 0.999409 0.999844
       DCP1A_WT_Untreated  0.989300     0.780377     0.994155 0.756256 0.994892
        FMRP_WT_Untreated  0.987385     0.724654     0.993899 0.746475 0.993179
         FUS_WT_Untreated  0.997637     0.948576     0.998862 0.954118 0.998717
       G3BP1_WT_Untreated  0.998086     0.965826     0.998941 0.960313 0.999093
       GM130_WT_Untreated  0.997282     0.948989     0.998411 0.933215 0.998806
       KIF5A_WT_Untreated  0.991690     0.825381     0.995856 0.833025 0.995627
       LAMP1_WT_Untreated  0.997447     0.962222     0.998402 0.942268 0.998976
 MitoTracker_WT_Untreated  0.998976     0.977334     0.999516 0.980529 0.999434
         NCL_WT_Untreated  0.999383     0.989318     0.999632 0.985248 0.999735
        NEMO_WT_Untreated  0.995457     0.916622     0.997526 0.906713 0.997812
         P54_WT_Untreated  0.997289     0.939905     0.998561 0.935443 0.998667
       PEX14_WT_Untreated  0.997192     0.911719     0.999132 0.959751 0.997999
         PML_WT_Untreated  0.990721     0.750087     0.995580 0.774101 0.994957
       PSD95_WT_Untreated  0.986385     0.664241     0.992771 0.645567 0.993340
        PURA_WT_Untreated  0.998775     0.978220     0.999276 0.970470 0.999470
  Phalloidin_WT_Untreated  0.998041     0.959937     0.998900 0.951598 0.999097
        SNCA_WT_Untreated  0.993156     0.869586     0.995872 0.822360 0.997130
      SQSTM1_WT_Untreated  0.984364     0.565901     0.992890 0.618580 0.991170
       TDP43_WT_Untreated  0.996547     0.921015     0.998226 0.920285 0.998244
      TOMM20_WT_Untreated  0.998762     0.974989     0.999322 0.971352 0.999410
        TUJ1_WT_Untreated  0.996457     0.994115     0.997038 0.988128 0.998538
            Macro Average  0.995300     0.895387     0.997573 0.896830 0.997585
In [7]:
run_train_test_split_baseline(
    Cytoself_dataset_config,        
    batches=[1],  
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},     
)
2025-08-20 11:55:37 INFO: [load_embeddings] multiplex=False
2025-08-20 11:55:37 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 11:55:37 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 11:55:37 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 11:55:45 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 11:55:48 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 11:55:49 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 11:55:50 INFO: [load_embeddings] embeddings shape: (112878, 2048)
2025-08-20 11:55:50 INFO: [load_embeddings] labels shape: (112878,)
2025-08-20 11:55:50 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 11:55:50 INFO: [load_embeddings] paths shape: (112878,)
Train dataset
(90302,) (90302, 2048) [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
2: 2445
17: 1681
3: 24343
8: 1897
24: 18081
5: 2330
14: 2098
16: 1838
4: 1891
12: 2167
15: 2004
9: 2098
7: 2274
6: 2182
0: 2091
18: 2170
21: 2121
13: 2348
11: 2182
10: 2454
20: 1963
22: 2028
1: 1951
19: 1775
23: 1890
Test dataset
(22576,) (22576, 2048) [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
16: 459
15: 501
21: 530
24: 4520
3: 6086
12: 542
20: 491
19: 444
5: 583
13: 587
7: 568
6: 546
1: 488
8: 474
4: 473
0: 523
18: 542
22: 507
10: 613
2: 611
17: 420
11: 546
23: 473
9: 524
14: 525
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       523
           1       0.98      0.96      0.97       488
           2       1.00      0.99      0.99       611
           3       1.00      1.00      1.00      6086
           4       0.87      0.89      0.88       473
           5       0.85      0.84      0.84       583
           6       0.97      0.97      0.97       546
           7       0.98      0.98      0.98       568
           8       0.96      0.97      0.96       474
           9       0.80      0.84      0.82       524
          10       0.97      0.99      0.98       613
          11       1.00      0.99      1.00       546
          12       0.99      1.00      0.99       542
          13       0.96      0.95      0.96       587
          14       0.97      0.96      0.96       525
          15       0.97      0.98      0.98       501
          16       0.88      0.85      0.86       459
          17       0.84      0.85      0.84       420
          18       0.98      0.98      0.98       542
          19       0.98      0.98      0.98       444
          20       0.93      0.91      0.92       491
          21       0.75      0.75      0.75       530
          22       0.94      0.94      0.94       507
          23       0.99      0.97      0.98       473
          24       1.00      1.00      1.00      4520

    accuracy                           0.97     22576
   macro avg       0.94      0.94      0.94     22576
weighted avg       0.97      0.97      0.97     22576

Accuracy: 0.9659

=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.998184     0.967495     0.998912 0.954717 0.999229
        CLTC_WT_Untreated  0.998848     0.963115     0.999638 0.983264 0.999185
Calreticulin_WT_Untreated  0.999601     0.990180     0.999863 0.995066 0.999727
        DAPI_WT_Untreated  0.999779     0.999343     0.999939 0.999836 0.999757
       DCP1A_WT_Untreated  0.994995     0.890063     0.997240 0.873444 0.997646
        FMRP_WT_Untreated  0.991938     0.842196     0.995908 0.845095 0.995817
         FUS_WT_Untreated  0.998405     0.967033     0.999183 0.967033 0.999183
       G3BP1_WT_Untreated  0.999114     0.980634     0.999591 0.984099 0.999500
       GM130_WT_Untreated  0.998450     0.968354     0.999095 0.958246 0.999321
       KIF5A_WT_Untreated  0.991495     0.837786     0.995148 0.804029 0.996142
       LAMP1_WT_Untreated  0.998804     0.985318     0.999180 0.971061 0.999590
 MitoTracker_WT_Untreated  0.999779     0.994505     0.999909 0.996330 0.999864
         NCL_WT_Untreated  0.999734     0.996310     0.999818 0.992647 0.999909
        NEMO_WT_Untreated  0.997741     0.952300     0.998954 0.960481 0.998727
         P54_WT_Untreated  0.998361     0.960000     0.999274 0.969231 0.999048
       PEX14_WT_Untreated  0.999070     0.984032     0.999411 0.974308 0.999638
         PML_WT_Untreated  0.994463     0.845316     0.997558 0.877828 0.996792
       PSD95_WT_Untreated  0.994153     0.847619     0.996931 0.839623 0.997111
        PURA_WT_Untreated  0.999026     0.981550     0.999455 0.977941 0.999546
  Phalloidin_WT_Untreated  0.999114     0.979730     0.999503 0.975336 0.999593
        SNCA_WT_Untreated  0.996545     0.912424     0.998415 0.927536 0.998054
      SQSTM1_WT_Untreated  0.988306     0.747170     0.994103 0.752852 0.993923
       TDP43_WT_Untreated  0.997431     0.942801     0.998686 0.942801 0.998686
      TOMM20_WT_Untreated  0.999158     0.972516     0.999729 0.987124 0.999412
        TUJ1_WT_Untreated  0.999291     0.998230     0.999557 0.998230 0.999557
            Macro Average  0.997271     0.940241     0.998600 0.940326 0.998598
In [43]:
## Baseline
run_baseline_model(
    dataset_config= Cytoself_dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=GaussianNB,
    classifier_kwargs={},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:33:54 INFO: [load_embeddings] multiplex=False
2025-08-20 16:33:54 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:33:54 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:33:54 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:34:01 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:34:04 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:34:05 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:34:06 INFO: [load_embeddings] embeddings shape: (112878, 2048)
2025-08-20 16:34:06 INFO: [load_embeddings] labels shape: (112878,)
2025-08-20 16:34:06 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:34:06 INFO: [load_embeddings] paths shape: (112878,)
2025-08-20 16:34:06 INFO: [load_embeddings] multiplex=False
2025-08-20 16:34:06 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:34:06 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:34:06 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:34:12 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:34:14 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:34:16 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:34:16 INFO: [load_embeddings] embeddings shape: (91973, 2048)
2025-08-20 16:34:16 INFO: [load_embeddings] labels shape: (91973,)
2025-08-20 16:34:16 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:34:16 INFO: [load_embeddings] paths shape: (91973,)
2025-08-20 16:34:16 INFO: [load_embeddings] multiplex=False
2025-08-20 16:34:16 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:34:16 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:34:16 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:34:22 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:34:24 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:34:25 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:34:26 INFO: [load_embeddings] embeddings shape: (85052, 2048)
2025-08-20 16:34:26 INFO: [load_embeddings] labels shape: (85052,)
2025-08-20 16:34:26 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:34:26 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (91973, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
              precision    recall  f1-score   support

           0       0.79      0.77      0.78      2123
           1       0.87      0.81      0.84      2536
           2       0.92      0.87      0.89      2079
           3       1.00      0.99      0.99     24823
           4       0.56      0.60      0.58      2319
           5       0.60      0.42      0.49      2608
           6       0.84      0.83      0.83      2236
           7       0.79      0.92      0.85      2265
           8       0.84      0.75      0.79      2110
           9       0.63      0.65      0.64      2104
          10       0.75      0.76      0.76      2243
          11       0.76      0.94      0.84      2236
          12       0.90      0.91      0.90      2227
          13       0.83      0.66      0.73      2360
          14       0.87      0.77      0.82      1916
          15       0.78      0.91      0.84      2074
          16       0.61      0.62      0.61      1818
          17       0.42      0.80      0.55      1631
          18       0.83      0.92      0.87      2090
          19       0.84      0.79      0.81      2019
          20       0.55      0.71      0.62      1923
          21       0.40      0.29      0.34      1654
          22       0.72      0.81      0.76      1934
          23       0.87      0.78      0.82      2114
          24       0.99      0.95      0.97     18531

    accuracy                           0.86     91973
   macro avg       0.76      0.77      0.76     91973
weighted avg       0.86      0.86      0.86     91973

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (85052, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
              precision    recall  f1-score   support

           0       0.74      0.77      0.75      1850
           1       0.81      0.46      0.59      2044
           2       0.90      0.89      0.89      2332
           3       1.00      0.99      0.99     22599
           4       0.48      0.48      0.48      1901
           5       0.23      0.17      0.20      1492
           6       0.78      0.83      0.80      2095
           7       0.66      0.91      0.76      2384
           8       0.83      0.71      0.76      2145
           9       0.79      0.60      0.68      2358
          10       0.70      0.75      0.73      2340
          11       0.73      0.92      0.82      2095
          12       0.83      0.90      0.86      2085
          13       0.74      0.82      0.78      2117
          14       0.86      0.76      0.81      1751
          15       0.72      0.78      0.75      1855
          16       0.44      0.37      0.40      1623
          17       0.35      0.55      0.43      1903
          18       0.76      0.96      0.85      2085
          19       0.89      0.76      0.82      2152
          20       0.50      0.74      0.60      1857
          21       0.31      0.14      0.19      1484
          22       0.74      0.83      0.78      1836
          23       0.80      0.76      0.78      2200
          24       0.98      0.93      0.95     16469

    accuracy                           0.82     85052
   macro avg       0.70      0.71      0.70     85052
weighted avg       0.83      0.82      0.82     85052


=== Overall Accuracy ===
0.8395543923084285 [0.8551422700140259, 0.8239665146028312]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.989391     0.767430     0.994487 0.761679 0.994660
        CLTC_WT_Untreated  0.987990     0.651310     0.996932 0.849374 0.990796
Calreticulin_WT_Untreated  0.994752     0.878259     0.997729 0.908111 0.996892
        DAPI_WT_Untreated  0.995758     0.986040     0.999313 0.998100 0.994915
       DCP1A_WT_Untreated  0.977370     0.541706     0.988010 0.524553 0.988799
        FMRP_WT_Untreated  0.975371     0.328049     0.990719 0.455932 0.984173
         FUS_WT_Untreated  0.990911     0.827984     0.994997 0.805843 0.995683
       G3BP1_WT_Untreated  0.988307     0.912669     0.990347 0.718300 0.997627
       GM130_WT_Untreated  0.990018     0.729495     0.996435 0.834409 0.993359
       KIF5A_WT_Untreated  0.983901     0.624160     0.993202 0.703638 0.990310
       LAMP1_WT_Untreated  0.986324     0.756491     0.992432 0.726530 0.993521
 MitoTracker_WT_Untreated  0.990611     0.930732     0.992113 0.747450 0.998252
         NCL_WT_Untreated  0.994238     0.907236     0.996410 0.863195 0.997681
        NEMO_WT_Untreated  0.988036     0.731740     0.994686 0.781302 0.993051
         P54_WT_Untreated  0.992713     0.767112     0.997485 0.865805 0.995086
       PEX14_WT_Untreated  0.990578     0.849071     0.993790 0.756291 0.996565
         PML_WT_Untreated  0.981963     0.497820     0.991560 0.539018 0.990060
       PSD95_WT_Untreated  0.972043     0.663271     0.978333 0.384073 0.993038
        PURA_WT_Untreated  0.992634     0.938443     0.993943 0.789124 0.998506
  Phalloidin_WT_Untreated  0.991849     0.774155     0.997102 0.865684 0.994564
        SNCA_WT_Untreated  0.980076     0.725926     0.985622 0.524164 0.993969
      SQSTM1_WT_Untreated  0.979568     0.218611     0.993300 0.370610 0.986002
       TDP43_WT_Untreated  0.989696     0.817241     0.993449 0.730787 0.996013
      TOMM20_WT_Untreated  0.990634     0.771210     0.996115 0.832166 0.994296
        TUJ1_WT_Untreated  0.985595     0.938229     0.997268 0.988322 0.984965
            Macro Average  0.987213     0.741376     0.993431 0.732978 0.993311
Out[43]:
{'Accuracy': 0.9872131054935742,
 'Sensitivity': 0.7413756087993754,
 'Specificity': 0.993431117325448,
 'PPV': 0.7329784414077817,
 'NPV': 0.9933113358320611}
In [44]:
## Baseline
run_baseline_model(
    dataset_config= Cytoself_dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=RidgeClassifier,
    classifier_kwargs={},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:37:06 INFO: [load_embeddings] multiplex=False
2025-08-20 16:37:06 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:37:06 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:37:06 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:37:21 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:37:27 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:37:31 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:37:32 INFO: [load_embeddings] embeddings shape: (112878, 2048)
2025-08-20 16:37:32 INFO: [load_embeddings] labels shape: (112878,)
2025-08-20 16:37:32 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:37:32 INFO: [load_embeddings] paths shape: (112878,)
2025-08-20 16:37:33 INFO: [load_embeddings] multiplex=False
2025-08-20 16:37:33 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:37:33 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:37:33 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:37:46 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:37:50 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:37:53 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:37:54 INFO: [load_embeddings] embeddings shape: (91973, 2048)
2025-08-20 16:37:54 INFO: [load_embeddings] labels shape: (91973,)
2025-08-20 16:37:54 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:37:54 INFO: [load_embeddings] paths shape: (91973,)
2025-08-20 16:37:55 INFO: [load_embeddings] multiplex=False
2025-08-20 16:37:55 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:37:55 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:37:55 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:38:07 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:38:11 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:38:14 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:38:15 INFO: [load_embeddings] embeddings shape: (85052, 2048)
2025-08-20 16:38:15 INFO: [load_embeddings] labels shape: (85052,)
2025-08-20 16:38:15 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:38:15 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (91973, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/linear_model/_ridge.py:211: LinAlgWarning: Ill-conditioned matrix (rcond=3.35676e-10): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      2123
           1       0.95      0.92      0.93      2536
           2       0.89      0.92      0.91      2079
           3       0.99      1.00      1.00     24823
           4       0.84      0.71      0.77      2319
           5       0.76      0.72      0.74      2608
           6       0.87      0.91      0.89      2236
           7       0.91      0.96      0.94      2265
           8       0.86      0.92      0.89      2110
           9       0.77      0.75      0.76      2104
          10       0.84      0.93      0.88      2243
          11       0.98      0.93      0.95      2236
          12       0.98      0.92      0.95      2227
          13       0.83      0.91      0.87      2360
          14       0.89      0.89      0.89      1916
          15       0.89      0.96      0.92      2074
          16       0.88      0.55      0.67      1818
          17       0.82      0.67      0.74      1631
          18       0.79      0.97      0.87      2090
          19       0.96      0.76      0.85      2019
          20       0.79      0.83      0.81      1923
          21       0.66      0.56      0.60      1654
          22       0.83      0.84      0.84      1934
          23       0.93      0.91      0.92      2114
          24       0.97      1.00      0.98     18531

    accuracy                           0.92     91973
   macro avg       0.87      0.85      0.86     91973
weighted avg       0.92      0.92      0.92     91973

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (85052, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/linear_model/_ridge.py:211: LinAlgWarning: Ill-conditioned matrix (rcond=3.35676e-10): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
              precision    recall  f1-score   support

           0       0.71      0.90      0.79      1850
           1       0.89      0.43      0.58      2044
           2       0.91      0.95      0.93      2332
           3       0.99      1.00      0.99     22599
           4       0.71      0.50      0.59      1901
           5       0.38      0.36      0.37      1492
           6       0.86      0.82      0.84      2095
           7       0.81      0.93      0.87      2384
           8       0.89      0.85      0.87      2145
           9       0.78      0.56      0.65      2358
          10       0.74      0.94      0.83      2340
          11       0.96      0.84      0.89      2095
          12       0.97      0.96      0.96      2085
          13       0.63      0.99      0.77      2117
          14       0.87      0.83      0.85      1751
          15       0.83      0.69      0.75      1855
          16       0.78      0.35      0.49      1623
          17       0.51      0.39      0.44      1903
          18       0.67      0.98      0.79      2085
          19       0.98      0.72      0.83      2152
          20       0.57      0.79      0.66      1857
          21       0.46      0.23      0.30      1484
          22       0.79      0.86      0.82      1836
          23       0.91      0.91      0.91      2200
          24       0.93      0.98      0.96     16469

    accuracy                           0.86     85052
   macro avg       0.78      0.75      0.75     85052
weighted avg       0.86      0.86      0.85     85052


=== Overall Accuracy ===
0.8897641608992843 [0.9188783664771183, 0.8606499553214504]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.992035     0.913416     0.993840 0.772950 0.998004
        CLTC_WT_Untreated  0.990905     0.697380     0.998701 0.934465 0.992016
Calreticulin_WT_Untreated  0.995893     0.937202     0.997393 0.901832 0.998394
        DAPI_WT_Untreated  0.997430     0.999873     0.996536 0.990619 0.999954
       DCP1A_WT_Untreated  0.986748     0.614929     0.995828 0.782569 0.990645
        FMRP_WT_Untreated  0.982296     0.585610     0.991702 0.625912 0.990190
         FUS_WT_Untreated  0.993459     0.866774     0.996636 0.865975 0.996659
       G3BP1_WT_Untreated  0.994515     0.946870     0.995800 0.858759 0.998563
       GM130_WT_Untreated  0.994170     0.885546     0.996846 0.873638 0.997180
       KIF5A_WT_Untreated  0.986318     0.650381     0.995005 0.770988 0.990996
       LAMP1_WT_Untreated  0.991606     0.936723     0.993064 0.782110 0.998309
 MitoTracker_WT_Untreated  0.996469     0.886631     0.999224 0.966281 0.997163
         NCL_WT_Untreated  0.997995     0.941095     0.999415 0.975715 0.998531
        NEMO_WT_Untreated  0.989165     0.947063     0.990258 0.716095 0.998615
         P54_WT_Untreated  0.994713     0.859013     0.997583 0.882600 0.997019
       PEX14_WT_Untreated  0.993363     0.831000     0.997048 0.864672 0.996167
         PML_WT_Untreated  0.987708     0.454519     0.998277 0.839506 0.989284
       PSD95_WT_Untreated  0.985070     0.516978     0.994605 0.661238 0.990204
        PURA_WT_Untreated  0.990662     0.974371     0.991056 0.724617 0.999376
  Phalloidin_WT_Untreated  0.993278     0.741309     0.999358 0.965345 0.993793
        SNCA_WT_Untreated  0.987301     0.814550     0.991070 0.665586 0.995934
      SQSTM1_WT_Untreated  0.984432     0.400255     0.994974 0.589671 0.989239
       TDP43_WT_Untreated  0.992560     0.850133     0.995660 0.809957 0.996735
      TOMM20_WT_Untreated  0.995865     0.911219     0.997979 0.918458 0.997783
        TUJ1_WT_Untreated  0.987849     0.989971     0.987326 0.950616 0.997503
            Macro Average  0.991272     0.806112     0.995407 0.827607 0.995530
Out[44]:
{'Accuracy': 0.9912721931930517,
 'Sensitivity': 0.8061124950415774,
 'Specificity': 0.9954072922750745,
 'PPV': 0.827607007333263,
 'NPV': 0.99553027214942}
In [45]:
## Baseline
run_baseline_model(
    dataset_config= Cytoself_dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=LinearSVC,
    classifier_kwargs={"C": 1.0, "max_iter": 1000, "random_state": 42},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:39:41 INFO: [load_embeddings] multiplex=False
2025-08-20 16:39:41 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:39:41 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:39:41 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:39:56 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:40:02 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:40:05 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:40:07 INFO: [load_embeddings] embeddings shape: (112878, 2048)
2025-08-20 16:40:07 INFO: [load_embeddings] labels shape: (112878,)
2025-08-20 16:40:07 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:40:07 INFO: [load_embeddings] paths shape: (112878,)
2025-08-20 16:40:08 INFO: [load_embeddings] multiplex=False
2025-08-20 16:40:08 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:40:08 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:40:08 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:40:20 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:40:24 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:40:27 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:40:28 INFO: [load_embeddings] embeddings shape: (91973, 2048)
2025-08-20 16:40:28 INFO: [load_embeddings] labels shape: (91973,)
2025-08-20 16:40:28 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:40:28 INFO: [load_embeddings] paths shape: (91973,)
2025-08-20 16:40:29 INFO: [load_embeddings] multiplex=False
2025-08-20 16:40:29 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:40:29 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:40:29 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:40:40 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:40:45 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:40:48 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:40:49 INFO: [load_embeddings] embeddings shape: (85052, 2048)
2025-08-20 16:40:49 INFO: [load_embeddings] labels shape: (85052,)
2025-08-20 16:40:49 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:40:49 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (91973, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.
  warnings.warn(
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_base.py:1242: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      2123
           1       0.97      0.94      0.96      2536
           2       0.97      0.95      0.96      2079
           3       1.00      1.00      1.00     24823
           4       0.83      0.70      0.76      2319
           5       0.61      0.89      0.73      2608
           6       0.96      0.91      0.94      2236
           7       0.96      0.97      0.97      2265
           8       0.94      0.92      0.93      2110
           9       0.80      0.77      0.78      2104
          10       0.93      0.95      0.94      2243
          11       0.98      0.98      0.98      2236
          12       0.98      0.99      0.98      2227
          13       0.95      0.83      0.89      2360
          14       0.93      0.91      0.92      1916
          15       0.97      0.95      0.96      2074
          16       0.87      0.67      0.75      1818
          17       0.76      0.76      0.76      1631
          18       0.95      0.98      0.96      2090
          19       0.96      0.89      0.93      2019
          20       0.82      0.89      0.86      1923
          21       0.61      0.59      0.60      1654
          22       0.87      0.88      0.88      1934
          23       0.97      0.95      0.96      2114
          24       0.99      1.00      1.00     18531

    accuracy                           0.94     91973
   macro avg       0.90      0.89      0.89     91973
weighted avg       0.94      0.94      0.94     91973

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (85052, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.
  warnings.warn(
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_base.py:1242: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.77      0.94      0.84      1850
           1       0.95      0.59      0.73      2044
           2       0.93      0.96      0.95      2332
           3       1.00      1.00      1.00     22599
           4       0.70      0.55      0.62      1901
           5       0.36      0.67      0.47      1492
           6       0.93      0.86      0.89      2095
           7       0.86      0.94      0.90      2384
           8       0.94      0.90      0.92      2145
           9       0.85      0.59      0.69      2358
          10       0.89      0.94      0.92      2340
          11       0.95      0.94      0.95      2095
          12       0.94      0.99      0.96      2085
          13       0.87      0.95      0.91      2117
          14       0.91      0.88      0.89      1751
          15       0.95      0.75      0.84      1855
          16       0.78      0.40      0.53      1623
          17       0.46      0.50      0.48      1903
          18       0.86      0.99      0.92      2085
          19       0.97      0.88      0.92      2152
          20       0.66      0.90      0.76      1857
          21       0.34      0.22      0.27      1484
          22       0.87      0.90      0.88      1836
          23       0.95      0.95      0.95      2200
          24       0.97      0.99      0.98     16469

    accuracy                           0.89     85052
   macro avg       0.83      0.81      0.81     85052
weighted avg       0.90      0.89      0.89     85052


=== Overall Accuracy ===
0.9159204645033483 [0.9387755102040817, 0.8930654188026149]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.994617     0.945885     0.995735 0.835854 0.998754
        CLTC_WT_Untreated  0.993679     0.782969     0.999275 0.966316 0.994265
Calreticulin_WT_Untreated  0.997611     0.958513     0.998610 0.946285 0.998939
        DAPI_WT_Untreated  0.999362     0.999852     0.999182 0.997769 0.999946
       DCP1A_WT_Untreated  0.986798     0.635782     0.995371 0.770313 0.991143
        FMRP_WT_Untreated  0.977286     0.812927     0.981183 0.505997 0.995500
         FUS_WT_Untreated  0.996001     0.887555     0.998720 0.945633 0.997184
       G3BP1_WT_Untreated  0.996311     0.953753     0.997459 0.910099 0.998751
       GM130_WT_Untreated  0.996458     0.909048     0.998611 0.941577 0.997762
       KIF5A_WT_Untreated  0.988120     0.671672     0.996303 0.824484 0.991551
       LAMP1_WT_Untreated  0.996210     0.946323     0.997535 0.910752 0.998572
 MitoTracker_WT_Untreated  0.998300     0.962133     0.999207 0.968169 0.999050
         NCL_WT_Untreated  0.998650     0.987477     0.998929 0.958361 0.999687
        NEMO_WT_Untreated  0.994967     0.888765     0.997722 0.910110 0.997116
         P54_WT_Untreated  0.996215     0.898009     0.998293 0.917526 0.997844
       PEX14_WT_Untreated  0.996051     0.856198     0.999226 0.961692 0.996744
         PML_WT_Untreated  0.989013     0.543447     0.997845 0.833333 0.991012
       PSD95_WT_Untreated  0.983884     0.621109     0.991273 0.591804 0.992274
        PURA_WT_Untreated  0.997023     0.981557     0.997397 0.901055 0.999554
  Phalloidin_WT_Untreated  0.996588     0.887797     0.999213 0.964574 0.997298
        SNCA_WT_Untreated  0.990849     0.894974     0.992941 0.734477 0.997697
      SQSTM1_WT_Untreated  0.982483     0.415551     0.992714 0.507196 0.989487
       TDP43_WT_Untreated  0.994769     0.889390     0.997062 0.868203 0.997592
      TOMM20_WT_Untreated  0.997740     0.949930     0.998935 0.957029 0.998750
        TUJ1_WT_Untreated  0.994645     0.992886     0.995078 0.980282 0.998241
            Macro Average  0.993345     0.850940     0.996553 0.864356 0.996589
Out[45]:
{'Accuracy': 0.9933451207456574,
 'Sensitivity': 0.8509400883652646,
 'Specificity': 0.9965527205134995,
 'PPV': 0.8643556633095661,
 'NPV': 0.9965885035678497}
In [46]:
run_baseline_model(
    Cytoself_dataset_config,
    batches=[1,2,3,],
    classifier_class=cuRF,
    classifier_kwargs={"n_estimators": 300, "random_state": 42},  # max_depth=0 => unlimited in cuML
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:49:28 INFO: [load_embeddings] multiplex=False
2025-08-20 16:49:28 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:49:28 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:49:28 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:49:43 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:49:49 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:49:53 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:49:55 INFO: [load_embeddings] embeddings shape: (112878, 2048)
2025-08-20 16:49:55 INFO: [load_embeddings] labels shape: (112878,)
2025-08-20 16:49:55 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:49:55 INFO: [load_embeddings] paths shape: (112878,)
2025-08-20 16:49:56 INFO: [load_embeddings] multiplex=False
2025-08-20 16:49:56 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:49:56 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:49:56 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:50:08 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:50:12 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:50:15 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:50:16 INFO: [load_embeddings] embeddings shape: (91973, 2048)
2025-08-20 16:50:16 INFO: [load_embeddings] labels shape: (91973,)
2025-08-20 16:50:16 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:50:16 INFO: [load_embeddings] paths shape: (91973,)
2025-08-20 16:50:17 INFO: [load_embeddings] multiplex=False
2025-08-20 16:50:17 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:50:17 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:50:17 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:50:39 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:50:44 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:50:47 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:50:48 INFO: [load_embeddings] embeddings shape: (85052, 2048)
2025-08-20 16:50:48 INFO: [load_embeddings] labels shape: (85052,)
2025-08-20 16:50:48 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:50:48 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (91973, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return func(**kwargs)
              precision    recall  f1-score   support

           0       0.76      0.91      0.83      2123
           1       0.92      0.74      0.82      2536
           2       0.92      0.86      0.89      2079
           3       0.99      1.00      1.00     24823
           4       0.73      0.60      0.66      2319
           5       0.61      0.76      0.68      2608
           6       0.88      0.93      0.90      2236
           7       0.87      0.91      0.89      2265
           8       0.82      0.86      0.84      2110
           9       0.74      0.61      0.66      2104
          10       0.77      0.95      0.85      2243
          11       0.97      0.66      0.78      2236
          12       0.99      0.91      0.95      2227
          13       0.75      0.89      0.81      2360
          14       0.90      0.81      0.85      1916
          15       0.92      0.94      0.93      2074
          16       0.84      0.38      0.53      1818
          17       0.80      0.52      0.63      1631
          18       0.85      0.96      0.90      2090
          19       0.90      0.39      0.54      2019
          20       0.76      0.74      0.75      1923
          21       0.53      0.49      0.51      1654
          22       0.77      0.87      0.81      1934
          23       0.95      0.80      0.87      2114
          24       0.88      1.00      0.94     18531

    accuracy                           0.88     91973
   macro avg       0.83      0.78      0.79     91973
weighted avg       0.88      0.88      0.87     91973

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (85052, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return func(**kwargs)
              precision    recall  f1-score   support

           0       0.62      0.89      0.73      1850
           1       0.66      0.12      0.20      2044
           2       0.91      0.91      0.91      2332
           3       0.98      1.00      0.99     22599
           4       0.54      0.38      0.45      1901
           5       0.34      0.53      0.41      1492
           6       0.82      0.81      0.82      2095
           7       0.84      0.75      0.79      2384
           8       0.85      0.84      0.84      2145
           9       0.83      0.43      0.57      2358
          10       0.67      0.94      0.78      2340
          11       0.97      0.49      0.65      2095
          12       0.97      0.96      0.96      2085
          13       0.63      0.99      0.77      2117
          14       0.91      0.77      0.83      1751
          15       0.85      0.61      0.71      1855
          16       0.74      0.16      0.26      1623
          17       0.51      0.26      0.35      1903
          18       0.74      0.97      0.84      2085
          19       0.98      0.41      0.58      2152
          20       0.61      0.75      0.67      1857
          21       0.43      0.36      0.39      1484
          22       0.77      0.88      0.82      1836
          23       0.91      0.86      0.88      2200
          24       0.81      0.99      0.89     16469

    accuracy                           0.82     85052
   macro avg       0.75      0.68      0.68     85052
weighted avg       0.83      0.82      0.80     85052


=== Overall Accuracy ===
0.8487214343931828 [0.8779206941167517, 0.8195221746696139]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.988736     0.901586     0.990737 0.690839 0.997725
        CLTC_WT_Untreated  0.984522     0.462445     0.998388 0.883973 0.985902
Calreticulin_WT_Untreated  0.995080     0.883246     0.997938 0.916275 0.997019
        DAPI_WT_Untreated  0.996249     0.999937     0.994900 0.986252 0.999977
       DCP1A_WT_Untreated  0.981731     0.500474     0.993484 0.652254 0.987870
        FMRP_WT_Untreated  0.976608     0.679512     0.983652 0.496348 0.992334
         FUS_WT_Untreated  0.993182     0.869083     0.996294 0.854678 0.996715
       G3BP1_WT_Untreated  0.991883     0.827920     0.996305 0.858003 0.995363
       GM130_WT_Untreated  0.992317     0.848649     0.995856 0.834527 0.996271
       KIF5A_WT_Untreated  0.983957     0.514343     0.996100 0.773248 0.987550
       LAMP1_WT_Untreated  0.988894     0.947196     0.990002 0.715746 0.998584
 MitoTracker_WT_Untreated  0.989211     0.578850     0.999502 0.966834 0.989543
         NCL_WT_Untreated  0.997932     0.933673     0.999537 0.980516 0.998346
        NEMO_WT_Untreated  0.987442     0.933884     0.988832 0.684512 0.998268
         P54_WT_Untreated  0.993961     0.792746     0.998218 0.903918 0.995627
       PEX14_WT_Untreated  0.993170     0.787987     0.997828 0.891705 0.995200
         PML_WT_Untreated  0.984697     0.278117     0.998704 0.809645 0.985874
       PSD95_WT_Untreated  0.983669     0.381154     0.995942 0.656753 0.987501
        PURA_WT_Untreated  0.993250     0.967665     0.993868 0.792157 0.999215
  Phalloidin_WT_Untreated  0.985234     0.398945     0.999381 0.939582 0.985695
        SNCA_WT_Untreated  0.986872     0.745503     0.992138 0.674163 0.994434
      SQSTM1_WT_Untreated  0.981726     0.428617     0.991707 0.482598 0.989709
       TDP43_WT_Untreated  0.991645     0.875066     0.994182 0.765962 0.997273
      TOMM20_WT_Untreated  0.994323     0.833102     0.998350 0.926527 0.995842
        TUJ1_WT_Untreated  0.963435     0.996943     0.955177 0.845706 0.999212
            Macro Average  0.987989     0.734666     0.993481 0.799309 0.993882
Out[46]:
{'Accuracy': 0.9879890410958904,
 'Sensitivity': 0.7346657636295809,
 'Specificity': 0.9934807932408529,
 'PPV': 0.7993088532262571,
 'NPV': 0.9938820322409838}
In [47]:
run_baseline_model(Cytoself_dataset_config,
    batches=[1,2,3],
    classifier_class=ExtraTreesClassifier,
    classifier_kwargs={"n_estimators": 300, "max_depth": None, "min_samples_leaf": 1,
                                     "n_jobs": -1, "random_state": 42},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:53:07 INFO: [load_embeddings] multiplex=False
2025-08-20 16:53:07 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:53:07 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 16:53:07 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:53:21 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:53:27 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:53:31 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:53:33 INFO: [load_embeddings] embeddings shape: (112878, 2048)
2025-08-20 16:53:33 INFO: [load_embeddings] labels shape: (112878,)
2025-08-20 16:53:33 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:53:33 INFO: [load_embeddings] paths shape: (112878,)
2025-08-20 16:53:33 INFO: [load_embeddings] multiplex=False
2025-08-20 16:53:33 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:53:33 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 16:53:33 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:53:45 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:53:49 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:53:52 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:53:53 INFO: [load_embeddings] embeddings shape: (91973, 2048)
2025-08-20 16:53:53 INFO: [load_embeddings] labels shape: (91973,)
2025-08-20 16:53:53 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:53:53 INFO: [load_embeddings] paths shape: (91973,)
2025-08-20 16:53:54 INFO: [load_embeddings] multiplex=False
2025-08-20 16:53:54 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 16:53:54 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 16:53:54 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
2025-08-20 16:54:54 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 16:54:58 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 16:55:01 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 16:55:02 INFO: [load_embeddings] embeddings shape: (85052, 2048)
2025-08-20 16:55:02 INFO: [load_embeddings] labels shape: (85052,)
2025-08-20 16:55:02 INFO: [load_embeddings] example label: ANAX11_WT_Untreated
2025-08-20 16:55:02 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (91973, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
              precision    recall  f1-score   support

           0       0.76      0.93      0.83      2123
           1       0.95      0.76      0.85      2536
           2       0.93      0.85      0.89      2079
           3       0.99      1.00      1.00     24823
           4       0.77      0.60      0.67      2319
           5       0.63      0.74      0.68      2608
           6       0.88      0.93      0.90      2236
           7       0.89      0.90      0.90      2265
           8       0.85      0.87      0.86      2110
           9       0.70      0.63      0.66      2104
          10       0.79      0.96      0.87      2243
          11       0.98      0.44      0.61      2236
          12       0.99      0.94      0.97      2227
          13       0.77      0.90      0.83      2360
          14       0.91      0.86      0.88      1916
          15       0.92      0.96      0.94      2074
          16       0.84      0.48      0.61      1818
          17       0.82      0.53      0.64      1631
          18       0.87      0.98      0.92      2090
          19       0.92      0.39      0.54      2019
          20       0.76      0.76      0.76      1923
          21       0.50      0.48      0.49      1654
          22       0.81      0.86      0.84      1934
          23       0.96      0.78      0.86      2114
          24       0.85      1.00      0.92     18531

    accuracy                           0.88     91973
   macro avg       0.84      0.78      0.80     91973
weighted avg       0.88      0.88      0.87     91973

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
Test: (85052, 2048) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
              precision    recall  f1-score   support

           0       0.65      0.91      0.76      1850
           1       0.71      0.10      0.18      2044
           2       0.91      0.91      0.91      2332
           3       0.99      1.00      0.99     22599
           4       0.57      0.39      0.46      1901
           5       0.34      0.53      0.41      1492
           6       0.82      0.85      0.84      2095
           7       0.87      0.74      0.80      2384
           8       0.86      0.84      0.85      2145
           9       0.77      0.47      0.58      2358
          10       0.69      0.95      0.80      2340
          11       0.99      0.26      0.41      2095
          12       0.97      0.97      0.97      2085
          13       0.63      0.99      0.77      2117
          14       0.91      0.83      0.87      1751
          15       0.84      0.66      0.74      1855
          16       0.75      0.21      0.33      1623
          17       0.50      0.25      0.33      1903
          18       0.77      0.98      0.86      2085
          19       0.98      0.42      0.59      2152
          20       0.65      0.79      0.71      1857
          21       0.44      0.35      0.39      1484
          22       0.82      0.88      0.85      1836
          23       0.94      0.83      0.88      2200
          24       0.78      1.00      0.87     16469

    accuracy                           0.82     85052
   macro avg       0.77      0.68      0.69     85052
weighted avg       0.83      0.82      0.80     85052


=== Overall Accuracy ===
0.8488203606606268 [0.8778010937992672, 0.8198396275219866]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.989470     0.919960     0.991066 0.702749 0.998149
        CLTC_WT_Untreated  0.985177     0.467031     0.998939 0.921189 0.986028
Calreticulin_WT_Untreated  0.995227     0.885287     0.998036 0.920123 0.997071
        DAPI_WT_Untreated  0.997328     0.999958     0.996366 0.990165 0.999985
       DCP1A_WT_Untreated  0.982765     0.502844     0.994485 0.690081 0.987939
        FMRP_WT_Untreated  0.976856     0.661707     0.984328 0.500277 0.991917
         FUS_WT_Untreated  0.993492     0.891249     0.996057 0.850033 0.997269
       G3BP1_WT_Untreated  0.992459     0.821682     0.997065 0.883033 0.995200
       GM130_WT_Untreated  0.993046     0.855229     0.996440 0.855430 0.996435
       KIF5A_WT_Untreated  0.983516     0.543478     0.994895 0.733515 0.988274
       LAMP1_WT_Untreated  0.989866     0.956360     0.990756 0.733311 0.998831
 MitoTracker_WT_Untreated  0.984036     0.352805     0.999867 0.985171 0.984026
         NCL_WT_Untreated  0.998503     0.957096     0.999537 0.980984 0.998930
        NEMO_WT_Untreated  0.987985     0.942819     0.989157 0.692876 0.998502
         P54_WT_Untreated  0.995001     0.845378     0.998166 0.906963 0.996734
       PEX14_WT_Untreated  0.993639     0.815475     0.997683 0.888766 0.995819
         PML_WT_Untreated  0.985838     0.351351     0.998416 0.814690 0.987285
       PSD95_WT_Untreated  0.983788     0.378891     0.996109 0.664846 0.987458
        PURA_WT_Untreated  0.994153     0.977725     0.994550 0.812500 0.999459
  Phalloidin_WT_Untreated  0.985465     0.403261     0.999514 0.952435 0.985798
        SNCA_WT_Untreated  0.988103     0.771429     0.992831 0.701299 0.995002
      SQSTM1_WT_Untreated  0.981511     0.417782     0.991684 0.475517 0.989516
       TDP43_WT_Untreated  0.992978     0.869496     0.995665 0.813601 0.997156
      TOMM20_WT_Untreated  0.994182     0.804590     0.998917 0.948879 0.995137
        TUJ1_WT_Untreated  0.955520     0.997543     0.945165 0.817620 0.999360
            Macro Average  0.987996     0.735617     0.993428 0.809442 0.993891
Out[47]:
{'Accuracy': 0.9879962717130348,
 'Sensitivity': 0.7356169771338387,
 'Specificity': 0.9934277589600906,
 'PPV': 0.8094420820850715,
 'NPV': 0.9938912399879954}
In [ ]:
 

Pretrained Model¶

In [48]:
pretrained_dataset_config = {
    "path_to_embeddings": "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model",
    "multiplexed": False,
    "config_fmt": "NIH_UMAP1_DatasetConfig_B{batch}",
    "config_dir": "manuscript/manuscript_figures_data_config",
}
In [49]:
## Baseline
run_baseline_model(
    dataset_config= pretrained_dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:37:26 INFO: [load_embeddings] multiplex=False
2025-08-20 17:37:26 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:37:26 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 17:37:26 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:37:30 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:37:31 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:37:31 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:37:32 INFO: [load_embeddings] embeddings shape: (115590, 192)
2025-08-20 17:37:32 INFO: [load_embeddings] labels shape: (115590,)
2025-08-20 17:37:32 INFO: [load_embeddings] example label: CLTC_WT_Untreated
2025-08-20 17:37:32 INFO: [load_embeddings] paths shape: (115590,)
2025-08-20 17:37:32 INFO: [load_embeddings] multiplex=False
2025-08-20 17:37:32 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:37:32 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 17:37:32 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:37:35 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:37:35 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:37:36 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:37:36 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 17:37:36 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 17:37:36 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 17:37:36 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 17:37:36 INFO: [load_embeddings] multiplex=False
2025-08-20 17:37:36 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:37:36 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 17:37:36 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:37:39 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:37:39 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:37:40 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:37:40 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 17:37:40 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 17:37:40 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated
2025-08-20 17:37:40 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
              precision    recall  f1-score   support

           0       0.91      0.94      0.92      2123
           1       0.99      0.95      0.97      2536
           2       0.97      0.94      0.96      2079
           3       1.00      1.00      1.00     24823
           4       0.89      0.82      0.85      2319
           5       0.85      0.88      0.87      2608
           6       0.98      0.92      0.95      2236
           7       0.96      0.98      0.97      2265
           8       0.97      0.96      0.96      2110
           9       0.89      0.87      0.88      2104
          10       0.95      0.96      0.96      2243
          11       0.97      0.97      0.97      2236
          12       0.99      0.99      0.99      2227
          13       0.95      0.93      0.94      2360
          14       0.94      0.92      0.93      1916
          15       0.96      0.97      0.96      2074
          16       0.92      0.85      0.89      1818
          17       0.88      0.88      0.88      1631
          18       0.95      0.97      0.96      2090
          19       0.98      0.91      0.95      2019
          20       0.90      0.94      0.92      1923
          21       0.76      0.85      0.80      1654
          22       0.87      0.92      0.89      1934
          23       0.91      0.94      0.93      2086
          24       0.97      0.97      0.97      2114
          25       0.99      1.00      0.99     18531

    accuracy                           0.96     94059
   macro avg       0.93      0.93      0.93     94059
weighted avg       0.96      0.96      0.96     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      1850
           1       0.96      0.51      0.67      2044
           2       0.97      0.96      0.97      2332
           3       0.99      1.00      0.99     22599
           4       0.76      0.62      0.68      1901
           5       0.48      0.41      0.44      1492
           6       0.97      0.80      0.88      2095
           7       0.81      0.96      0.88      2384
           8       0.94      0.97      0.95      2145
           9       0.96      0.84      0.89      2358
          10       0.95      0.97      0.96      2340
          11       0.94      0.94      0.94      2095
          12       0.98      0.97      0.98      2085
          13       0.82      0.99      0.89      2117
          14       0.94      0.89      0.92      1751
          15       0.97      0.91      0.94      1855
          16       0.67      0.50      0.58      1623
          17       0.66      0.71      0.69      1903
          18       0.87      0.98      0.92      2085
          19       0.98      0.91      0.95      2152
          20       0.65      0.92      0.76      1857
          21       0.80      0.54      0.64      1484
          22       0.89      0.94      0.92      1836
          23       0.84      0.94      0.89      2078
          24       0.95      0.98      0.96      2200
          25       0.95      0.99      0.97     16469

    accuracy                           0.91     87130
   macro avg       0.87      0.85      0.85     87130
weighted avg       0.92      0.91      0.91     87130


=== Overall Accuracy ===
0.9370611776259995 [0.9599825641352768, 0.9141397911167222]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.995060     0.908130     0.997009 0.871919 0.997938
        CLTC_WT_Untreated  0.993322     0.752402     0.999570 0.978421 0.993617
Calreticulin_WT_Untreated  0.998184     0.952618     0.999321 0.972235 0.998818
        DAPI_WT_Untreated  0.998471     0.999895     0.997967 0.994296 0.999963
       DCP1A_WT_Untreated  0.990336     0.731043     0.996519 0.833558 0.993605
        FMRP_WT_Untreated  0.987499     0.707561     0.993980 0.731283 0.993234
         FUS_WT_Untreated  0.996137     0.859617     0.999480 0.975885 0.996572
       G3BP1_WT_Untreated  0.995690     0.968595     0.996403 0.876411 0.999171
       GM130_WT_Untreated  0.998046     0.963572     0.998875 0.953710 0.999124
       KIF5A_WT_Untreated  0.994740     0.852757     0.998325 0.927822 0.996290
       LAMP1_WT_Untreated  0.997792     0.963125     0.998692 0.950269 0.999043
 MitoTracker_WT_Untreated  0.997892     0.958439     0.998858 0.953595 0.998982
         NCL_WT_Untreated  0.999200     0.980056     0.999666 0.986231 0.999514
        NEMO_WT_Untreated  0.995673     0.957338     0.996644 0.878459 0.998917
         P54_WT_Untreated  0.996926     0.906463     0.998795 0.939514 0.998069
       PEX14_WT_Untreated  0.997936     0.939425     0.999233 0.964463 0.998658
         PML_WT_Untreated  0.991125     0.687591     0.997001 0.816143 0.993970
       PSD95_WT_Untreated  0.991026     0.788908     0.995047 0.760087 0.995798
        PURA_WT_Untreated  0.997119     0.973892     0.997667 0.907792 0.999383
  Phalloidin_WT_Untreated  0.997660     0.914889     0.999610 0.982239 0.997998
        SNCA_WT_Untreated  0.992362     0.930952     0.993670 0.758078 0.998522
      SQSTM1_WT_Untreated  0.991285     0.700446     0.996411 0.774762 0.994730
       TDP43_WT_Untreated  0.995921     0.930239     0.997317 0.880492 0.998516
        TIA1_WT_Untreated  0.995496     0.943324     0.996724 0.871340 0.998664
      TOMM20_WT_Untreated  0.998383     0.975661     0.998937 0.957244 0.999406
        TUJ1_WT_Untreated  0.992593     0.990429     0.993112 0.971771 0.997698
            Macro Average  0.995226     0.893745     0.997494 0.902616 0.997546
Out[49]:
{'Accuracy': 0.9952259795020667,
 'Sensitivity': 0.8937447502194574,
 'Specificity': 0.9974935796886114,
 'PPV': 0.9026161965801902,
 'NPV': 0.9975461538416751}
In [50]:
## Baseline
run_baseline_model(
    dataset_config= pretrained_dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=GaussianNB,
    classifier_kwargs={},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:37:47 INFO: [load_embeddings] multiplex=False
2025-08-20 17:37:47 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:37:47 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 17:37:47 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:37:49 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:37:50 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:37:50 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:37:51 INFO: [load_embeddings] embeddings shape: (115590, 192)
2025-08-20 17:37:51 INFO: [load_embeddings] labels shape: (115590,)
2025-08-20 17:37:51 INFO: [load_embeddings] example label: CLTC_WT_Untreated
2025-08-20 17:37:51 INFO: [load_embeddings] paths shape: (115590,)
2025-08-20 17:37:51 INFO: [load_embeddings] multiplex=False
2025-08-20 17:37:51 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:37:51 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 17:37:51 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:37:52 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:37:53 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:37:53 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:37:53 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 17:37:53 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 17:37:53 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 17:37:53 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 17:37:53 INFO: [load_embeddings] multiplex=False
2025-08-20 17:37:53 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:37:53 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 17:37:53 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:37:55 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:37:55 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:37:56 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:37:56 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 17:37:56 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 17:37:56 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated
2025-08-20 17:37:56 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
              precision    recall  f1-score   support

           0       0.82      0.85      0.83      2123
           1       0.93      0.90      0.91      2536
           2       0.86      0.87      0.86      2079
           3       1.00      1.00      1.00     24823
           4       0.77      0.67      0.72      2319
           5       0.73      0.72      0.72      2608
           6       0.96      0.86      0.90      2236
           7       0.81      0.96      0.88      2265
           8       0.91      0.88      0.89      2110
           9       0.79      0.85      0.82      2104
          10       0.83      0.88      0.85      2243
          11       0.86      0.95      0.91      2236
          12       0.94      0.98      0.96      2227
          13       0.92      0.86      0.89      2360
          14       0.87      0.79      0.83      1916
          15       0.90      0.91      0.90      2074
          16       0.71      0.71      0.71      1818
          17       0.77      0.81      0.79      1631
          18       0.90      0.92      0.91      2090
          19       0.92      0.90      0.91      2019
          20       0.75      0.90      0.82      1923
          21       0.70      0.66      0.68      1654
          22       0.76      0.81      0.79      1934
          23       0.80      0.89      0.84      2086
          24       0.89      0.88      0.89      2114
          25       0.99      0.96      0.97     18531

    accuracy                           0.91     94059
   macro avg       0.85      0.86      0.85     94059
weighted avg       0.91      0.91      0.91     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
              precision    recall  f1-score   support

           0       0.76      0.79      0.77      1850
           1       0.81      0.46      0.59      2044
           2       0.89      0.89      0.89      2332
           3       0.99      1.00      0.99     22599
           4       0.64      0.38      0.47      1901
           5       0.19      0.12      0.15      1492
           6       0.91      0.70      0.79      2095
           7       0.67      0.96      0.79      2384
           8       0.90      0.87      0.88      2145
           9       0.92      0.81      0.87      2358
          10       0.78      0.90      0.84      2340
          11       0.76      0.92      0.83      2095
          12       0.93      0.97      0.95      2085
          13       0.83      0.95      0.89      2117
          14       0.88      0.74      0.80      1751
          15       0.87      0.72      0.79      1855
          16       0.48      0.43      0.45      1623
          17       0.61      0.62      0.61      1903
          18       0.82      0.95      0.88      2085
          19       0.94      0.89      0.92      2152
          20       0.48      0.94      0.63      1857
          21       0.85      0.47      0.61      1484
          22       0.78      0.85      0.82      1836
          23       0.75      0.91      0.82      2078
          24       0.88      0.89      0.89      2200
          25       0.97      0.93      0.95     16469

    accuracy                           0.86     87130
   macro avg       0.78      0.77      0.76     87130
weighted avg       0.87      0.86      0.86     87130


=== Overall Accuracy ===
0.8862662487311861 [0.9105561402949213, 0.8619763571674509]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.991203     0.818273     0.995079 0.788504 0.995922
        CLTC_WT_Untreated  0.990226     0.700873     0.997729 0.888950 0.992285
Calreticulin_WT_Untreated  0.993918     0.880299     0.996753 0.871214 0.997012
        DAPI_WT_Untreated  0.997196     0.995087     0.997944 0.994206 0.998258
       DCP1A_WT_Untreated  0.984381     0.540047     0.994977 0.719381 0.989097
        FMRP_WT_Untreated  0.980744     0.499512     0.991885 0.587661 0.988453
         FUS_WT_Untreated  0.993388     0.779266     0.998632 0.933094 0.994616
       G3BP1_WT_Untreated  0.989972     0.961282     0.990727 0.731903 0.998972
       GM130_WT_Untreated  0.994818     0.872385     0.997762 0.903603 0.996934
       KIF5A_WT_Untreated  0.992290     0.829673     0.996396 0.853192 0.995703
       LAMP1_WT_Untreated  0.991710     0.890901     0.994326 0.802950 0.997161
 MitoTracker_WT_Untreated  0.993289     0.938582     0.994628 0.810568 0.998490
         NCL_WT_Untreated  0.997825     0.976809     0.998338 0.934754 0.999434
        NEMO_WT_Untreated  0.994497     0.904847     0.996769 0.876460 0.997587
         P54_WT_Untreated  0.993068     0.764930     0.997781 0.876837 0.995157
       PEX14_WT_Untreated  0.993846     0.817256     0.997760 0.889967 0.995957
         PML_WT_Untreated  0.984828     0.575414     0.992754 0.605875 0.991788
       PSD95_WT_Untreated  0.987935     0.707980     0.993504 0.684354 0.994187
        PURA_WT_Untreated  0.995060     0.937725     0.996413 0.860440 0.998528
  Phalloidin_WT_Untreated  0.996065     0.895708     0.998430 0.930742 0.997545
        SNCA_WT_Untreated  0.984651     0.920106     0.986027 0.583851 0.998277
      SQSTM1_WT_Untreated  0.989337     0.570108     0.996726 0.754216 0.992456
       TDP43_WT_Untreated  0.991374     0.828912     0.994826 0.772941 0.996359
        TIA1_WT_Untreated  0.991572     0.896494     0.993809 0.773038 0.997556
      TOMM20_WT_Untreated  0.994641     0.888503     0.997230 0.886653 0.997280
        TUJ1_WT_Untreated  0.986555     0.946114     0.996238 0.983662 0.987216
            Macro Average  0.991323     0.820657     0.995517 0.819193 0.995470
Out[50]:
{'Accuracy': 0.991322702982903,
 'Sensitivity': 0.8206572429223186,
 'Specificity': 0.9955169763968997,
 'PPV': 0.81919290735299,
 'NPV': 0.9954703683087419}
In [51]:
## Baseline
run_baseline_model(
    dataset_config= pretrained_dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=RidgeClassifier,
    classifier_kwargs={},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:38:02 INFO: [load_embeddings] multiplex=False
2025-08-20 17:38:02 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:38:02 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 17:38:02 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:38:04 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:38:05 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:38:05 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:38:06 INFO: [load_embeddings] embeddings shape: (115590, 192)
2025-08-20 17:38:06 INFO: [load_embeddings] labels shape: (115590,)
2025-08-20 17:38:06 INFO: [load_embeddings] example label: CLTC_WT_Untreated
2025-08-20 17:38:06 INFO: [load_embeddings] paths shape: (115590,)
2025-08-20 17:38:06 INFO: [load_embeddings] multiplex=False
2025-08-20 17:38:06 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:38:06 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 17:38:06 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:38:07 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:38:07 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:38:08 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:38:08 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 17:38:08 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 17:38:08 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 17:38:08 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 17:38:08 INFO: [load_embeddings] multiplex=False
2025-08-20 17:38:08 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:38:08 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 17:38:08 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:38:09 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:38:10 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:38:10 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:38:11 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 17:38:11 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 17:38:11 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated
2025-08-20 17:38:11 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
              precision    recall  f1-score   support

           0       0.83      0.86      0.85      2123
           1       0.98      0.82      0.89      2536
           2       0.88      0.88      0.88      2079
           3       0.99      1.00      0.99     24823
           4       0.89      0.72      0.80      2319
           5       0.79      0.84      0.81      2608
           6       0.95      0.90      0.92      2236
           7       0.93      0.95      0.94      2265
           8       0.93      0.94      0.94      2110
           9       0.86      0.83      0.85      2104
          10       0.89      0.94      0.92      2243
          11       0.96      0.95      0.96      2236
          12       0.99      0.97      0.98      2227
          13       0.89      0.93      0.91      2360
          14       0.84      0.90      0.87      1916
          15       0.93      0.94      0.94      2074
          16       0.92      0.66      0.77      1818
          17       0.87      0.83      0.84      1631
          18       0.90      0.95      0.93      2090
          19       0.97      0.80      0.88      2019
          20       0.84      0.91      0.87      1923
          21       0.78      0.74      0.76      1654
          22       0.83      0.80      0.81      1934
          23       0.87      0.89      0.88      2086
          24       0.95      0.93      0.94      2114
          25       0.95      1.00      0.97     18531

    accuracy                           0.93     94059
   macro avg       0.90      0.88      0.89     94059
weighted avg       0.93      0.93      0.93     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
              precision    recall  f1-score   support

           0       0.80      0.77      0.78      1850
           1       0.93      0.30      0.45      2044
           2       0.95      0.92      0.93      2332
           3       0.97      1.00      0.99     22599
           4       0.77      0.39      0.52      1901
           5       0.31      0.27      0.29      1492
           6       0.93      0.74      0.82      2095
           7       0.81      0.91      0.86      2384
           8       0.92      0.94      0.93      2145
           9       0.93      0.78      0.85      2358
          10       0.90      0.95      0.92      2340
          11       0.92      0.92      0.92      2095
          12       0.98      0.96      0.97      2085
          13       0.69      0.99      0.81      2117
          14       0.84      0.85      0.85      1751
          15       0.94      0.89      0.91      1855
          16       0.66      0.40      0.50      1623
          17       0.63      0.69      0.66      1903
          18       0.83      0.97      0.89      2085
          19       0.97      0.83      0.90      2152
          20       0.55      0.82      0.66      1857
          21       0.79      0.38      0.51      1484
          22       0.85      0.87      0.86      1836
          23       0.80      0.88      0.84      2078
          24       0.90      0.95      0.92      2200
          25       0.91      0.98      0.94     16469

    accuracy                           0.88     87130
   macro avg       0.83      0.78      0.79     87130
weighted avg       0.88      0.88      0.87     87130


=== Overall Accuracy ===
0.9040429000423724 [0.9303628573554896, 0.8777229427292551]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.991992     0.820539     0.995836 0.815408 0.995976
        CLTC_WT_Untreated  0.988984     0.584934     0.999462 0.965753 0.989345
Calreticulin_WT_Untreated  0.995541     0.899796     0.997930 0.915571 0.997501
        DAPI_WT_Untreated  0.994906     1.000000     0.993100 0.980908 1.000000
       DCP1A_WT_Untreated  0.987665     0.573460     0.997542 0.847636 0.989907
        FMRP_WT_Untreated  0.983388     0.630976     0.991547 0.633448 0.991457
         FUS_WT_Untreated  0.994409     0.818518     0.998716 0.939820 0.995570
       G3BP1_WT_Untreated  0.994508     0.928157     0.996256 0.867162 0.998105
       GM130_WT_Untreated  0.996827     0.939365     0.998208 0.926518 0.998541
       KIF5A_WT_Untreated  0.992864     0.803227     0.997652 0.896224 0.995045
       LAMP1_WT_Untreated  0.995883     0.948287     0.997118 0.895160 0.998656
 MitoTracker_WT_Untreated  0.997053     0.939737     0.998456 0.937140 0.998524
         NCL_WT_Untreated  0.998874     0.964750     0.999706 0.987654 0.999141
        NEMO_WT_Untreated  0.992257     0.960241     0.993068 0.778240 0.998987
         P54_WT_Untreated  0.994111     0.875648     0.996558 0.840136 0.997429
       PEX14_WT_Untreated  0.996843     0.920845     0.998528 0.932715 0.998246
         PML_WT_Untreated  0.988835     0.538797     0.997547 0.809607 0.991129
       PSD95_WT_Untreated  0.989828     0.752405     0.994551 0.733113 0.995072
        PURA_WT_Untreated  0.995596     0.961198     0.996407 0.863196 0.999082
  Phalloidin_WT_Untreated  0.995182     0.813474     0.999463 0.972764 0.995622
        SNCA_WT_Untreated  0.988371     0.866667     0.990964 0.671449 0.997141
      SQSTM1_WT_Untreated  0.989751     0.568834     0.997169 0.779817 0.992437
       TDP43_WT_Untreated  0.993140     0.832361     0.996556 0.837023 0.996438
        TIA1_WT_Untreated  0.993294     0.884726     0.995848 0.833673 0.997285
      TOMM20_WT_Untreated  0.996711     0.941122     0.998066 0.922308 0.998563
        TUJ1_WT_Untreated  0.983288     0.989743     0.981743 0.928464 0.997505
            Macro Average  0.992696     0.836839     0.996077 0.865804 0.996258
Out[51]:
{'Accuracy': 0.9926961095023175,
 'Sensitivity': 0.8368386339245435,
 'Specificity': 0.9960768450269657,
 'PPV': 0.8658041113997191,
 'NPV': 0.9962578451271408}
In [52]:
## Baseline
run_baseline_model(
    dataset_config= pretrained_dataset_config,
    batches=[1, 2, 3],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=LinearSVC,
    classifier_kwargs={"C": 1.0, "max_iter": 1000, "random_state": 42},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:38:14 INFO: [load_embeddings] multiplex=False
2025-08-20 17:38:14 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:38:14 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 17:38:14 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:38:16 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:38:17 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:38:17 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:38:17 INFO: [load_embeddings] embeddings shape: (115590, 192)
2025-08-20 17:38:17 INFO: [load_embeddings] labels shape: (115590,)
2025-08-20 17:38:17 INFO: [load_embeddings] example label: CLTC_WT_Untreated
2025-08-20 17:38:17 INFO: [load_embeddings] paths shape: (115590,)
2025-08-20 17:38:17 INFO: [load_embeddings] multiplex=False
2025-08-20 17:38:17 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:38:17 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 17:38:17 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:38:19 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:38:19 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:38:20 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:38:20 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 17:38:20 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 17:38:20 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 17:38:20 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 17:38:20 INFO: [load_embeddings] multiplex=False
2025-08-20 17:38:20 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:38:20 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 17:38:20 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:38:21 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:38:22 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:38:22 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:38:22 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 17:38:22 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 17:38:22 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated
2025-08-20 17:38:22 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      2123
           1       0.99      0.96      0.97      2536
           2       0.97      0.96      0.97      2079
           3       1.00      1.00      1.00     24823
           4       0.91      0.85      0.88      2319
           5       0.87      0.90      0.88      2608
           6       0.98      0.94      0.96      2236
           7       0.97      0.98      0.98      2265
           8       0.97      0.98      0.98      2110
           9       0.91      0.87      0.89      2104
          10       0.97      0.97      0.97      2243
          11       0.97      0.98      0.97      2236
          12       0.99      1.00      1.00      2227
          13       0.96      0.94      0.95      2360
          14       0.94      0.94      0.94      1916
          15       0.97      0.97      0.97      2074
          16       0.93      0.89      0.91      1818
          17       0.89      0.88      0.89      1631
          18       0.96      0.98      0.97      2090
          19       0.99      0.94      0.96      2019
          20       0.91      0.95      0.93      1923
          21       0.77      0.85      0.81      1654
          22       0.89      0.92      0.90      1934
          23       0.92      0.95      0.93      2086
          24       0.98      0.98      0.98      2114
          25       0.99      1.00      0.99     18531

    accuracy                           0.97     94059
   macro avg       0.94      0.94      0.94     94059
weighted avg       0.97      0.97      0.97     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.88      0.89      0.89      1850
           1       0.96      0.55      0.70      2044
           2       0.97      0.98      0.97      2332
           3       1.00      1.00      1.00     22599
           4       0.78      0.66      0.72      1901
           5       0.48      0.38      0.42      1492
           6       0.98      0.89      0.93      2095
           7       0.82      0.97      0.89      2384
           8       0.93      0.97      0.95      2145
           9       0.97      0.82      0.89      2358
          10       0.97      0.97      0.97      2340
          11       0.95      0.96      0.96      2095
          12       0.97      0.99      0.98      2085
          13       0.84      0.99      0.91      2117
          14       0.94      0.91      0.93      1751
          15       0.97      0.92      0.94      1855
          16       0.67      0.55      0.60      1623
          17       0.64      0.72      0.68      1903
          18       0.88      0.99      0.93      2085
          19       0.98      0.94      0.96      2152
          20       0.67      0.94      0.78      1857
          21       0.74      0.48      0.58      1484
          22       0.91      0.94      0.93      1836
          23       0.85      0.95      0.90      2078
          24       0.96      0.99      0.98      2200
          25       0.96      0.99      0.98     16469

    accuracy                           0.92     87130
   macro avg       0.87      0.86      0.86     87130
weighted avg       0.92      0.92      0.92     87130


=== Overall Accuracy ===
0.9445453416512052 [0.9668824886507404, 0.9222081946516699]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.996313     0.926001     0.997890 0.907723 0.998340
        CLTC_WT_Untreated  0.993940     0.776856     0.999570 0.979086 0.994244
Calreticulin_WT_Untreated  0.998587     0.968488     0.999338 0.973342 0.999214
        DAPI_WT_Untreated  0.999404     0.999937     0.999215 0.997791 0.999978
       DCP1A_WT_Untreated  0.991407     0.763507     0.996841 0.852156 0.994375
        FMRP_WT_Untreated  0.988040     0.708293     0.994517 0.749419 0.993255
         FUS_WT_Untreated  0.997494     0.914108     0.999536 0.979708 0.997900
       G3BP1_WT_Untreated  0.996275     0.978920     0.996732 0.887480 0.999443
       GM130_WT_Untreated  0.998306     0.976028     0.998841 0.952960 0.999423
       KIF5A_WT_Untreated  0.994851     0.843568     0.998670 0.941235 0.996061
       LAMP1_WT_Untreated  0.998444     0.971634     0.999139 0.966992 0.999264
 MitoTracker_WT_Untreated  0.998377     0.972062     0.999022 0.960529 0.999316
         NCL_WT_Untreated  0.999393     0.991187     0.999593 0.983433 0.999785
        NEMO_WT_Untreated  0.996407     0.961358     0.997295 0.900042 0.999019
         P54_WT_Untreated  0.997351     0.923643     0.998873 0.944243 0.998423
       PEX14_WT_Untreated  0.998273     0.946806     0.999413 0.972803 0.998822
         PML_WT_Untreated  0.991749     0.727114     0.996872 0.818182 0.994729
       PSD95_WT_Untreated  0.990877     0.796265     0.994748 0.751001 0.995942
        PURA_WT_Untreated  0.997583     0.982275     0.997944 0.918477 0.999581
  Phalloidin_WT_Untreated  0.998256     0.939343     0.999644 0.984175 0.998572
        SNCA_WT_Untreated  0.993079     0.945238     0.994098 0.773377 0.998828
      SQSTM1_WT_Untreated  0.990756     0.678776     0.996254 0.761530 0.994350
       TDP43_WT_Untreated  0.996396     0.929178     0.997824 0.900746 0.998494
        TIA1_WT_Untreated  0.995982     0.950048     0.997063 0.883825 0.998823
      TOMM20_WT_Untreated  0.999018     0.986555     0.999322 0.972578 0.999672
        TUJ1_WT_Untreated  0.994244     0.991829     0.994822 0.978659 0.998037
            Macro Average  0.995800     0.905731     0.997811 0.911211 0.997842
Out[52]:
{'Accuracy': 0.9957999657815871,
 'Sensitivity': 0.9057314282550202,
 'Specificity': 0.9978106343982152,
 'PPV': 0.9112112336388931,
 'NPV': 0.9978418992101846}
In [53]:
run_baseline_model(
    pretrained_dataset_config,
    batches=[1,2,3,],
    classifier_class=cuRF,
    classifier_kwargs={"n_estimators": 300, "random_state": 42}, 
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:39:02 INFO: [load_embeddings] multiplex=False
2025-08-20 17:39:02 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:39:02 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 17:39:02 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:39:04 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:39:05 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:39:06 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:39:06 INFO: [load_embeddings] embeddings shape: (115590, 192)
2025-08-20 17:39:06 INFO: [load_embeddings] labels shape: (115590,)
2025-08-20 17:39:06 INFO: [load_embeddings] example label: CLTC_WT_Untreated
2025-08-20 17:39:06 INFO: [load_embeddings] paths shape: (115590,)
2025-08-20 17:39:06 INFO: [load_embeddings] multiplex=False
2025-08-20 17:39:06 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:39:06 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 17:39:06 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:39:07 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:39:08 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:39:08 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:39:08 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 17:39:08 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 17:39:08 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 17:39:08 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 17:39:08 INFO: [load_embeddings] multiplex=False
2025-08-20 17:39:08 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:39:08 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 17:39:08 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:39:10 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:39:10 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:39:11 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:39:11 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 17:39:11 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 17:39:11 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated
2025-08-20 17:39:11 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return func(**kwargs)
              precision    recall  f1-score   support

           0       0.89      0.92      0.91      2123
           1       0.98      0.89      0.94      2536
           2       0.94      0.92      0.93      2079
           3       1.00      1.00      1.00     24823
           4       0.86      0.74      0.80      2319
           5       0.79      0.85      0.81      2608
           6       0.98      0.90      0.94      2236
           7       0.95      0.97      0.96      2265
           8       0.96      0.95      0.96      2110
           9       0.87      0.87      0.87      2104
          10       0.89      0.95      0.92      2243
          11       0.96      0.96      0.96      2236
          12       0.99      0.98      0.99      2227
          13       0.95      0.92      0.93      2360
          14       0.92      0.88      0.90      1916
          15       0.95      0.94      0.95      2074
          16       0.89      0.80      0.85      1818
          17       0.88      0.84      0.86      1631
          18       0.93      0.96      0.94      2090
          19       0.98      0.87      0.92      2019
          20       0.86      0.93      0.89      1923
          21       0.76      0.78      0.77      1654
          22       0.83      0.90      0.86      1934
          23       0.88      0.93      0.91      2086
          24       0.97      0.94      0.95      2114
          25       0.97      0.99      0.98     18531

    accuracy                           0.95     94059
   macro avg       0.92      0.91      0.91     94059
weighted avg       0.95      0.95      0.95     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return func(**kwargs)
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1850
           1       0.93      0.38      0.54      2044
           2       0.95      0.95      0.95      2332
           3       0.98      1.00      0.99     22599
           4       0.71      0.50      0.59      1901
           5       0.32      0.26      0.29      1492
           6       0.94      0.76      0.84      2095
           7       0.82      0.92      0.87      2384
           8       0.94      0.94      0.94      2145
           9       0.96      0.83      0.89      2358
          10       0.88      0.96      0.92      2340
          11       0.94      0.92      0.93      2095
          12       0.98      0.98      0.98      2085
          13       0.80      0.98      0.88      2117
          14       0.93      0.87      0.90      1751
          15       0.96      0.84      0.90      1855
          16       0.63      0.48      0.55      1623
          17       0.70      0.66      0.68      1903
          18       0.85      0.97      0.91      2085
          19       0.97      0.89      0.93      2152
          20       0.57      0.92      0.70      1857
          21       0.89      0.61      0.72      1484
          22       0.87      0.93      0.90      1836
          23       0.83      0.95      0.88      2078
          24       0.95      0.95      0.95      2200
          25       0.93      0.99      0.96     16469

    accuracy                           0.90     87130
   macro avg       0.85      0.82      0.82     87130
weighted avg       0.90      0.90      0.89     87130


=== Overall Accuracy ===
0.9220545504680028 [0.9459913458573874, 0.8981177550786181]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.994757     0.889756     0.997111 0.873487 0.997527
        CLTC_WT_Untreated  0.990943     0.663100     0.999445 0.968740 0.991334
Calreticulin_WT_Untreated  0.997080     0.933575     0.998665 0.945797 0.998343
        DAPI_WT_Untreated  0.997356     0.999937     0.996442 0.990062 0.999977
       DCP1A_WT_Untreated  0.987759     0.633886     0.996197 0.798984 0.991312
        FMRP_WT_Untreated  0.983895     0.632195     0.992038 0.647676 0.991489
         FUS_WT_Untreated  0.995248     0.833988     0.999197 0.962174 0.995948
       G3BP1_WT_Untreated  0.995314     0.944719     0.996647 0.881220 0.998541
       GM130_WT_Untreated  0.997583     0.946886     0.998802 0.950012 0.998723
       KIF5A_WT_Untreated  0.994459     0.852981     0.998031 0.916225 0.996295
       LAMP1_WT_Untreated  0.995723     0.955706     0.996761 0.884491 0.998848
 MitoTracker_WT_Untreated  0.997378     0.939506     0.998796 0.950257 0.998519
         NCL_WT_Untreated  0.999244     0.979128     0.999734 0.988990 0.999491
        NEMO_WT_Untreated  0.995226     0.949073     0.996395 0.869628 0.998707
         P54_WT_Untreated  0.995966     0.872375     0.998518 0.924032 0.997367
       PEX14_WT_Untreated  0.996788     0.896411     0.999013 0.952664 0.997707
         PML_WT_Untreated  0.989878     0.653298     0.996394 0.778124 0.993309
       PSD95_WT_Untreated  0.990987     0.739672     0.995987 0.785693 0.994827
        PURA_WT_Untreated  0.996473     0.965030     0.997215 0.890977 0.999174
  Phalloidin_WT_Untreated  0.996689     0.878207     0.999480 0.975499 0.997137
        SNCA_WT_Untreated  0.989519     0.924074     0.990914 0.684231 0.998370
      SQSTM1_WT_Untreated  0.991898     0.701083     0.997023 0.805861 0.994744
       TDP43_WT_Untreated  0.994845     0.913528     0.996573 0.849951 0.998160
        TIA1_WT_Untreated  0.994933     0.937800     0.996277 0.855609 0.998534
      TOMM20_WT_Untreated  0.997831     0.946222     0.999090 0.962055 0.998689
        TUJ1_WT_Untreated  0.988167     0.991371     0.987400 0.949589 0.997912
            Macro Average  0.994075     0.868212     0.996852 0.886232 0.996961
Out[53]:
{'Accuracy': 0.9940746105745085,
 'Sensitivity': 0.8682118440730283,
 'Specificity': 0.9968516972810667,
 'PPV': 0.8862318807161186,
 'NPV': 0.9969609440727576}
In [54]:
run_baseline_model(pretrained_dataset_config,
    batches=[1,2,3],
    classifier_class=ExtraTreesClassifier,
    classifier_kwargs={"n_estimators": 300, "max_depth": None, "min_samples_leaf": 1,
                                     "n_jobs": -1, "random_state": 42},
    train_specific_batches = [1],
    results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:39:48 INFO: [load_embeddings] multiplex=False
2025-08-20 17:39:48 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:39:48 INFO: [load_embeddings] input_folders = ['batch1']
2025-08-20 17:39:48 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:39:50 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:39:51 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:39:51 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:39:52 INFO: [load_embeddings] embeddings shape: (115590, 192)
2025-08-20 17:39:52 INFO: [load_embeddings] labels shape: (115590,)
2025-08-20 17:39:52 INFO: [load_embeddings] example label: CLTC_WT_Untreated
2025-08-20 17:39:52 INFO: [load_embeddings] paths shape: (115590,)
2025-08-20 17:39:52 INFO: [load_embeddings] multiplex=False
2025-08-20 17:39:52 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:39:52 INFO: [load_embeddings] input_folders = ['batch2']
2025-08-20 17:39:52 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:39:53 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:39:54 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:39:54 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:39:54 INFO: [load_embeddings] embeddings shape: (94059, 192)
2025-08-20 17:39:54 INFO: [load_embeddings] labels shape: (94059,)
2025-08-20 17:39:54 INFO: [load_embeddings] example label: DAPI_WT_Untreated
2025-08-20 17:39:54 INFO: [load_embeddings] paths shape: (94059,)
2025-08-20 17:39:54 INFO: [load_embeddings] multiplex=False
2025-08-20 17:39:54 INFO: [load_embeddings] experiment_type = NIH
2025-08-20 17:39:54 INFO: [load_embeddings] input_folders = ['batch3']
2025-08-20 17:39:54 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
2025-08-20 17:39:56 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41']
2025-08-20 17:39:56 INFO: [embeddings_utils._filter] cell_lines = ['WT']
2025-08-20 17:39:56 INFO: [embeddings_utils._filter] conditions = ['Untreated']
2025-08-20 17:39:57 INFO: [load_embeddings] embeddings shape: (87130, 192)
2025-08-20 17:39:57 INFO: [load_embeddings] labels shape: (87130,)
2025-08-20 17:39:57 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated
2025-08-20 17:39:57 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].

=== Batch [2] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (94059, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      2123
           1       0.98      0.91      0.95      2536
           2       0.95      0.92      0.94      2079
           3       1.00      1.00      1.00     24823
           4       0.88      0.76      0.81      2319
           5       0.81      0.86      0.83      2608
           6       0.98      0.92      0.95      2236
           7       0.94      0.98      0.96      2265
           8       0.96      0.96      0.96      2110
           9       0.88      0.88      0.88      2104
          10       0.90      0.95      0.93      2243
          11       0.96      0.96      0.96      2236
          12       1.00      0.98      0.99      2227
          13       0.95      0.93      0.94      2360
          14       0.92      0.89      0.90      1916
          15       0.96      0.94      0.95      2074
          16       0.91      0.82      0.86      1818
          17       0.89      0.85      0.87      1631
          18       0.93      0.96      0.95      2090
          19       0.98      0.87      0.92      2019
          20       0.86      0.93      0.90      1923
          21       0.77      0.80      0.78      1654
          22       0.85      0.90      0.87      1934
          23       0.90      0.93      0.92      2086
          24       0.97      0.94      0.96      2114
          25       0.97      1.00      0.98     18531

    accuracy                           0.95     94059
   macro avg       0.92      0.91      0.92     94059
weighted avg       0.95      0.95      0.95     94059

Training on Batches: [1], Testing on: [3].

=== Batch [3] ===
Train: (115590, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Test: (87130, 192) Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1850
           1       0.93      0.37      0.53      2044
           2       0.95      0.95      0.95      2332
           3       0.99      1.00      0.99     22599
           4       0.73      0.50      0.59      1901
           5       0.37      0.31      0.34      1492
           6       0.95      0.78      0.85      2095
           7       0.81      0.95      0.88      2384
           8       0.94      0.95      0.94      2145
           9       0.97      0.84      0.90      2358
          10       0.88      0.96      0.92      2340
          11       0.94      0.92      0.93      2095
          12       0.98      0.98      0.98      2085
          13       0.81      0.99      0.89      2117
          14       0.92      0.87      0.90      1751
          15       0.96      0.84      0.90      1855
          16       0.64      0.48      0.55      1623
          17       0.70      0.69      0.69      1903
          18       0.86      0.98      0.92      2085
          19       0.98      0.88      0.93      2152
          20       0.58      0.92      0.71      1857
          21       0.88      0.59      0.70      1484
          22       0.87      0.92      0.90      1836
          23       0.84      0.95      0.89      2078
          24       0.95      0.96      0.96      2200
          25       0.93      0.99      0.96     16469

    accuracy                           0.90     87130
   macro avg       0.85      0.82      0.83     87130
weighted avg       0.90      0.90      0.90     87130


=== Overall Accuracy ===
0.9257885153576774 [0.9502227325402142, 0.9013542981751406]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
      ANAX11_WT_Untreated  0.994922     0.901334     0.997021 0.871502 0.997786
        CLTC_WT_Untreated  0.991147     0.669432     0.999490 0.971483 0.991496
Calreticulin_WT_Untreated  0.997213     0.934482     0.998778 0.950207 0.998366
        DAPI_WT_Untreated  0.997721     0.999958     0.996927 0.991407 0.999985
       DCP1A_WT_Untreated  0.988437     0.642180     0.996694 0.822458 0.991512
        FMRP_WT_Untreated  0.985054     0.658780     0.992608 0.673566 0.992104
         FUS_WT_Untreated  0.995778     0.853383     0.999265 0.966022 0.996420
       G3BP1_WT_Untreated  0.995469     0.963003     0.996324 0.873391 0.999023
       GM130_WT_Untreated  0.997676     0.951586     0.998785 0.949578 0.998836
       KIF5A_WT_Untreated  0.994834     0.858359     0.998280 0.926463 0.996430
       LAMP1_WT_Untreated  0.995971     0.958979     0.996931 0.890217 0.998933
 MitoTracker_WT_Untreated  0.997489     0.941122     0.998869 0.953227 0.998559
         NCL_WT_Untreated  0.999283     0.979592     0.999763 0.990155 0.999503
        NEMO_WT_Untreated  0.995496     0.955551     0.996508 0.873953 0.998871
         P54_WT_Untreated  0.996070     0.883283     0.998400 0.919387 0.997591
       PEX14_WT_Untreated  0.996904     0.895139     0.999159 0.959356 0.997679
         PML_WT_Untreated  0.990253     0.658529     0.996675 0.793140 0.993411
       PSD95_WT_Untreated  0.991291     0.762875     0.995835 0.784633 0.995286
        PURA_WT_Untreated  0.996722     0.969102     0.997373 0.896919 0.999270
  Phalloidin_WT_Untreated  0.996771     0.876528     0.999605 0.981213 0.997098
        SNCA_WT_Untreated  0.989911     0.924868     0.991297 0.693651 0.998388
      SQSTM1_WT_Untreated  0.991953     0.697897     0.997136 0.811111 0.994689
       TDP43_WT_Untreated  0.995049     0.907692     0.996906 0.861748 0.998036
        TIA1_WT_Untreated  0.995408     0.941643     0.996673 0.869401 0.998625
      TOMM20_WT_Untreated  0.997892     0.951089     0.999033 0.959991 0.998807
        TUJ1_WT_Untreated  0.988730     0.992429     0.987845 0.951331 0.998168
            Macro Average  0.994363     0.874185     0.997007 0.891750 0.997110
Out[54]:
{'Accuracy': 0.9943633018985275,
 'Sensitivity': 0.8741852119354661,
 'Specificity': 0.9970069021537664,
 'PPV': 0.8917503692831067,
 'NPV': 0.9971104454643182}
In [ ]: